def create_songdet(h5, sngidxfle): ''' Collects song details for all unique songs heard by 100 raters. Format of dictionary: { SongID : [ Att_0, Att_1, Att_2 ] } ''' import hdf5_getters sngdetfle = open("songdet.txt", "wb") sngdetdic = dict() sngidxfle = open(sngidxfle, "rb") sngidxdic = pickle.load(sngidxfle) for elem in sngidxdic: songidx = sngidxdic[elem] tempo = hdf5_getters.get_tempo(h5, songidx) loud = hdf5_getters.get_loudness(h5, songidx) year = hdf5_getters.get_year(h5, songidx) tmsig = hdf5_getters.get_time_signature(h5, songidx) key = hdf5_getters.get_key(h5, songidx) mode = hdf5_getters.get_mode(h5, songidx) duration = hdf5_getters.get_duration(h5, songidx) fadein = hdf5_getters.get_end_of_fade_in(h5, songidx) fadeout = hdf5_getters.get_start_of_fade_out(h5, songidx) artfam = hdf5_getters.get_artist_familiarity(h5, songidx) sngdetdic[elem] = [ duration, tmsig, tempo, key, mode, fadein, fadeout, year, loud, artfam ] pickle.dump(sngdetdic, sngdetfle) sngdetfle.close()
def create_songdet(h5, sngidxfle): ''' Collects song details for all unique songs heard by 100 raters. Format of dictionary: { SongID : [ Att_0, Att_1, Att_2 ] } ''' import hdf5_getters sngdetfle = open("songdet.txt", "wb") sngdetdic = dict() sngidxfle = open(sngidxfle, "rb") sngidxdic = pickle.load(sngidxfle) for elem in sngidxdic: songidx = sngidxdic[elem] tempo = hdf5_getters.get_tempo(h5, songidx) loud = hdf5_getters.get_loudness(h5, songidx) year = hdf5_getters.get_year(h5, songidx) tmsig = hdf5_getters.get_time_signature(h5, songidx) key = hdf5_getters.get_key(h5, songidx) mode = hdf5_getters.get_mode(h5, songidx) duration = hdf5_getters.get_duration(h5, songidx) fadein = hdf5_getters.get_end_of_fade_in(h5, songidx) fadeout = hdf5_getters.get_start_of_fade_out(h5, songidx) artfam = hdf5_getters.get_artist_familiarity(h5, songidx) sngdetdic[elem] = [duration, tmsig, tempo, key, mode, fadein, fadeout, year, loud, artfam] pickle.dump(sngdetdic, sngdetfle) sngdetfle.close()
def get_all_examples(basedir, genre_dict, ext='.h5'): """ From a base directory, goes through all subdirectories, and grabs all songs and their features and puts them into a pandas dataframe INPUT basedir - base directory of the dataset genre_dict - a dictionary mapping track id to genre based tagraum dataset ext - extension, .h5 by default RETURN dataframe containing all song examples """ features_vs_genre = pd.DataFrame() # iterate over all files in all subdirectories for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) # # count files # count += len(files) # apply function to all files for f in files: h5 = GETTERS.open_h5_file_read(f) song_id = GETTERS.get_track_id(h5).decode('utf-8') if (song_id in genre_dict): genre = genre_dict[song_id] year = GETTERS.get_year(h5) duration = GETTERS.get_duration(h5) end_of_fade_in = GETTERS.get_end_of_fade_in(h5) loudness = GETTERS.get_loudness(h5) song_hotttnesss = GETTERS.get_song_hotttnesss(h5) tempo = GETTERS.get_tempo(h5) key = GETTERS.get_key(h5) key_confidence = GETTERS.get_key_confidence(h5) mode = GETTERS.get_mode(h5) mode_confidence = GETTERS.get_mode_confidence(h5) time_signature = GETTERS.get_time_signature(h5) time_signature_confidence = GETTERS.get_time_signature_confidence( h5) artist_name = GETTERS.get_artist_name(h5) title = GETTERS.get_title(h5) # length of sections_start array gives us number of start num_sections = len(GETTERS.get_sections_start(h5)) num_segments = len(GETTERS.get_segments_confidence(h5)) example = pd.DataFrame( data=[ (artist_name, title, song_id, genre, year, key, key_confidence, mode, mode_confidence, time_signature, time_signature_confidence, duration, end_of_fade_in, loudness, song_hotttnesss, tempo, num_sections) ], columns=[ 'artist_name', 'title', 'song_id', 'genre', 'year', 'key', 'key_confidence', 'mode', 'mode_confidence', 'time_signature', 'time_signature_confidence', 'duration', 'end_of_fade_in', 'loudness', 'song_hotttnesss', 'tempo', 'num_segments' ]) features_vs_genre = features_vs_genre.append(example) h5.close() return features_vs_genre
def plots(track): f, axarr = plt.subplots(2, sharex=True) path = "../../msd_dense_subset/dense/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5" h5 = GETTERS.open_h5_file_read(path) segments = (GETTERS.get_segments_start(h5)) sections = (GETTERS.get_sections_start(h5)) max_loudness = (GETTERS.get_segments_loudness_max(h5)) loudness = (GETTERS.get_segments_loudness_start(h5)) average_loudness = (max_loudness + loudness) / 2 average_loudness_song = GETTERS.get_loudness(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) pitches = GETTERS.get_segments_pitches(h5) h5.close() axarr[0].set_title('loudness curve for ' + ut.get_track_info(track)) axarr[0].plot(segments, average_loudness, label='Filtered') axarr[0].axhline(average_loudness_song, color='green') for section in enumerate(sections): axarr[0].axvline(section[1], color='red') axarr[0].axvline(start_fade_out, color='green') axarr[0].axvline(end_fade_in, color='green') idx = list() for section in sections: dif = segments - section posdif = np.where(dif >=0) idx.append(posdif[0][0]) axarr[1].set_title('Chroma values for ' + ut.get_track_info(track)) #axarr[1].set_xticks(idx,sections.astype(int)) extent =[0,segments.shape[0],0,12] axarr[1].imshow(pitches.transpose(),extent = extent,aspect = 'auto',interpolation='nearest',origin='lower') plt.show()
def get_loudness(track,h5=None): #returns #0: the average loudness of the track #1: the variance of the loudness over the track #2: the difference between the highest and lowest dB value between the end of fade in and start of fade out close = (h5== None) if h5 == None: path = "../../msd_dense_subset/dense/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5" h5 = GETTERS.open_h5_file_read(path) loudness_avg = GETTERS.get_loudness(h5) loudnesses_interval = GETTERS.get_segments_loudness_max(h5) start_segments = GETTERS.get_segments_start(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) idx = np.where((start_segments > end_fade_in) & (start_segments < start_fade_out)) if close: h5.close() #return (loudness_avg, np.var(loudnesses_interval[idx]), min(loudnesses_interval[idx]), max(loudnesses_interval[idx])) return (loudness_avg, np.var(loudnesses_interval[idx]), abs(max(loudnesses_interval[idx]) - abs(min(loudnesses_interval[idx]))))
def plot_loudness_curve(track): path = "../../msd_dense_subset/mood/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5" h5 = GETTERS.open_h5_file_read(path) segments = (GETTERS.get_segments_start(h5)) sections = (GETTERS.get_sections_start(h5)) max_loudness = (GETTERS.get_segments_loudness_max(h5)) loudness = (GETTERS.get_segments_loudness_start(h5)) average_loudness = (max_loudness + loudness) / 2 average_loudness_song = GETTERS.get_loudness(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) plt.title('loudness curve for ' + ut.get_track_info(track)) plt.ylabel('Loudness(dB)') plt.xlabel('Time(s)') plt.plot(segments, average_loudness, label='Filtered') plt.axhline(average_loudness_song, color='green',label='average') for section in enumerate(sections): plt.axvline(section[1], color='red') plt.axvline(start_fade_out, color='green') plt.axvline(end_fade_in, color='green') #plt.legend(('average')) h5.close() plt.show()
def get_feats(h5): f = [] f.append(hdf5_getters.get_artist_name(h5).decode('utf8').replace(',', '')) f.append(hdf5_getters.get_title(h5).decode('utf8').replace(',', '')) f.append(str(hdf5_getters.get_loudness(h5))) f.append(str(hdf5_getters.get_tempo(h5))) f.append(str(hdf5_getters.get_time_signature(h5))) f.append(str(hdf5_getters.get_key(h5))) f.append(str(hdf5_getters.get_mode(h5))) f.append(str(hdf5_getters.get_duration(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_timbre(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_pitches(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_loudness_max(h5))) f.extend( get_statistical_feats(hdf5_getters.get_segments_loudness_max_time(h5))) f.extend( get_statistical_feats(hdf5_getters.get_segments_loudness_start(h5))) f.append(str(hdf5_getters.get_song_hotttnesss(h5))) f.append(str(hdf5_getters.get_danceability(h5))) f.append(str(hdf5_getters.get_end_of_fade_in(h5))) f.append(str(hdf5_getters.get_energy(h5))) f.append(str(hdf5_getters.get_start_of_fade_out(h5))) f.append(str(hdf5_getters.get_year(h5))) return f
def get_loudness(track, h5=None): #returns #0: the average loudness of the track #1: the variance of the loudness over the track #2: the difference between the highest and lowest dB value between the end of fade in and start of fade out close = (h5 == None) if h5 == None: path = "../../msd_dense_subset/dense/" + track[2] + "/" + track[ 3] + "/" + track[4] + "/" + track + ".h5" h5 = GETTERS.open_h5_file_read(path) loudness_avg = GETTERS.get_loudness(h5) loudnesses_interval = GETTERS.get_segments_loudness_max(h5) start_segments = GETTERS.get_segments_start(h5) start_fade_out = GETTERS.get_start_of_fade_out(h5) end_fade_in = GETTERS.get_end_of_fade_in(h5) idx = np.where((start_segments > end_fade_in) & (start_segments < start_fade_out)) if close: h5.close() #return (loudness_avg, np.var(loudnesses_interval[idx]), min(loudnesses_interval[idx]), max(loudnesses_interval[idx])) return (loudness_avg, np.var(loudnesses_interval[idx]), abs( max(loudnesses_interval[idx]) - abs(min(loudnesses_interval[idx]))))
def parse_aggregate_songs(file_name,file_name2,artist_map): """ Given an aggregate filename and artist_map in the format {artist_name: {data pertaining to artist}} """ """ TODO: -this function goes through each song, if artist not in there, add all data necesary and add first song info. else update any specific song info -song info is a map from attributename:[values] """ #artist_map = {} h5 = hdf5_getters.open_h5_file_read(file_name) numSongs = hdf5_getters.get_num_songs(h5) print 'Parsing song file...' for i in range(numSongs): artist_name = hdf5_getters.get_artist_name(h5,i) #Filter location longi = hdf5_getters.get_artist_longitude(h5,i) lat = hdf5_getters.get_artist_latitude(h5,i) loc = hdf5_getters.get_artist_location(h5,i) if math.isnan(lat) or math.isnan(longi): #skip if no location continue #filter year yr = hdf5_getters.get_year(h5,i) if yr == 0: #skip if no year continue #filter hotttness and familiarity familiarity = hdf5_getters.get_artist_familiarity(h5,i) hotttness = hdf5_getters.get_artist_hotttnesss(h5,i) if familiarity<=0.0 or hotttness<=0.0: #skip if no hotttness or familiarity computations continue #TODO:MAYBE filter on dance and energy timbre = hdf5_getters.get_segments_timbre(h5,i) #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each if not artist_name in artist_map: #have not encountered the artist yet, so populate new map sub_map = {} sub_map['artist_familiarity'] = familiarity sub_map['artist_hotttnesss'] = hotttness sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i) #longi = hdf5_getters.get_artist_longitude(h5,i) #lat = hdf5_getters.get_artist_latitude(h5,i) #longi = None if math.isnan(longi) else longi #lat = None if math.isnan(lat) else lat sub_map['artist_latitude'] = lat sub_map['artist_longitude'] = longi sub_map['artist_location'] = loc sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i) #TODO:see if should weight by freq or weight for if the term matches one of the feature terms sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i)) sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i)) #song-sepcific data #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA: #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy sub_map['danceability'] = [dance] sub_map['duration'] = [hdf5_getters.get_duration(h5,i)] sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)] sub_map['energy'] = [energy] #since each song has a key, ask if feature for keys should be num of songs that appear in that key or #just binary if any of their songs has that key or just be avg of songs with that key #same for mode, since its either major or minor...should it be count or avg.? sub_map['key'] = [hdf5_getters.get_key(h5,i)] sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)] sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot sub_map['song_hotttnesss'] = [s_hot] sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)] sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)] #should time signature be count as well? binary? sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)] sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)] #should year be binary since they can have many songs across years and should it be year:count sub_map['year'] = [yr] artist_map[artist_name] = sub_map else: #artist already exists, so get its map and update song fields dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy artist_map[artist_name]['danceability'].append(dance) artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i)) artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i)) artist_map[artist_name]['energy'].append(energy) artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i)) artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i)) artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot artist_map[artist_name]['song_hotttnesss'].append(s_hot) artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i)) artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i)) #should time signature be count as well? binary? artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i)) artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i)) #should year be binary since they can have many songs across years and should it be year:count artist_map[artist_name]['year'].append(yr) h5 = hdf5_getters.open_h5_file_read(file_name2) numSongs = hdf5_getters.get_num_songs(h5) print 'Parsing song file2...' for i in range(numSongs): song_id = hdf5_getters.get_track_id(h5,i) artist_name = hdf5_getters.get_artist_name(h5,i) if artist_name in artist_map and song_id in artist_map[artist_name]['track_id']: continue #Filter location longi = hdf5_getters.get_artist_longitude(h5,i) lat = hdf5_getters.get_artist_latitude(h5,i) loc = hdf5_getters.get_artist_location(h5,i) if math.isnan(lat) or math.isnan(longi): #skip if no location continue #filter year yr = hdf5_getters.get_year(h5,i) if yr == 0: #skip if no year continue #filter hotttness and familiarity familiarity = hdf5_getters.get_artist_familiarity(h5,i) hotttness = hdf5_getters.get_artist_hotttnesss(h5,i) if familiarity<=0.0 or hotttness<=0.0: #skip if no hotttness or familiarity computations continue #TODO:MAYBE filter on dance and energy timbre = hdf5_getters.get_segments_timbre(h5,i) #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each if not artist_name in artist_map: #have not encountered the artist yet, so populate new map sub_map = {} sub_map['artist_familiarity'] = familiarity sub_map['artist_hotttnesss'] = hotttness sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i) #longi = hdf5_getters.get_artist_longitude(h5,i) #lat = hdf5_getters.get_artist_latitude(h5,i) #longi = None if math.isnan(longi) else longi #lat = None if math.isnan(lat) else lat sub_map['artist_latitude'] = lat sub_map['artist_longitude'] = longi sub_map['artist_location'] = loc sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i) #TODO:see if should weight by freq or weight for if the term matches one of the feature terms sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i)) sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i)) #song-sepcific data #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA: #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy sub_map['danceability'] = [dance] sub_map['duration'] = [hdf5_getters.get_duration(h5,i)] sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)] sub_map['energy'] = [energy] #since each song has a key, ask if feature for keys should be num of songs that appear in that key or #just binary if any of their songs has that key or just be avg of songs with that key #same for mode, since its either major or minor...should it be count or avg.? sub_map['key'] = [hdf5_getters.get_key(h5,i)] sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)] sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot sub_map['song_hotttnesss'] = [s_hot] sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)] sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)] #should time signature be count as well? binary? sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)] sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)] #should year be binary since they can have many songs across years and should it be year:count sub_map['year'] = [yr] artist_map[artist_name] = sub_map else: #artist already exists, so get its map and update song fields dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy artist_map[artist_name]['danceability'].append(dance) artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i)) artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i)) artist_map[artist_name]['energy'].append(energy) artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i)) artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i)) artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot artist_map[artist_name]['song_hotttnesss'].append(s_hot) artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i)) artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i)) #should time signature be count as well? binary? artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i)) artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i)) #should year be binary since they can have many songs across years and should it be year:count artist_map[artist_name]['year'].append(yr)
str(e) for e in GETTERS.get_artist_mbtags_count(h5, i)) # array artist_name = GETTERS.get_artist_name(h5, i) artist_playmeid = GETTERS.get_artist_playmeid(h5, i) artist_terms = ','.join( str(e) for e in GETTERS.get_artist_terms(h5, i)) # array #artist_terms_freq = ','.join(str(e) for e in GETTERS.get_artist_terms_freq(h5, i)) # array #artist_terms_weight = ','.join(str(e) for e in GETTERS.get_artist_terms_weight(h5, i)) # array #audio_md5 = GETTERS.get_audio_md5(h5, i) #bars_confidence = ','.join(str(e) for e in GETTERS.get_bars_confidence(h5, i)) # array #bars_start = ','.join(str(e) for e in GETTERS.get_bars_start(h5, i)) # array #beats_confidence = ','.join(str(e) for e in GETTERS.get_beats_confidence(h5, i)) # array #beats_start = ','.join(str(e) for e in GETTERS.get_beats_start(h5, i)) # array danceability = GETTERS.get_danceability(h5, i) duration = GETTERS.get_duration(h5, i) end_of_fade_in = GETTERS.get_end_of_fade_in(h5, i) energy = GETTERS.get_energy(h5, i) key = GETTERS.get_key(h5, i) key_confidence = GETTERS.get_key_confidence(h5, i) loudness = GETTERS.get_loudness(h5, i) mode = GETTERS.get_mode(h5, i) mode_confidence = GETTERS.get_mode_confidence(h5, i) release = GETTERS.get_release(h5, i) release_7digitalid = GETTERS.get_release_7digitalid(h5, i) #sections_confidence = ','.join(str(e) for e in GETTERS.get_sections_confidence(h5, i)) # array #sections_start = ','.join(str(e) for e in GETTERS.get_sections_start(h5, i)) # array #segments_confidence = ','.join(str(e) for e in GETTERS.get_segments_confidence(h5, i)) # array #segments_loudness_max = ','.join(str(e) for e in GETTERS.get_segments_loudness_max(h5, i)) # array #segments_loudness_max_time = ','.join(str(e) for e in GETTERS.get_segments_loudness_max_time(h5, i)) # array #segments_loudness_start = ','.join(str(e) for e in GETTERS.get_segments_loudness_start(h5, i)) # array #segments_pitches = ','.join(str(e) for e in GETTERS.get_segments_pitches(h5, i)) # array
def main(): outputFile1 = open('SongCSV.csv', 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input("\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"+ " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'track_id'.lower(): csvRowString += 'track_id' elif attribute == 'artist_familiarity'.lower(): csvRowString += 'artist_familiarity' elif attribute == 'artist_hotttnesss'.lower(): csvRowString += 'artist_hotttnesss' elif attribute == 'artist_mbid'.lower(): csvRowString += 'artist_mbid' elif attribute == 'artist_playmeid'.lower(): csvRowString += 'artist_playmeid' elif attribute == 'artist_7digitalid'.lower(): csvRowString += 'artist_7digitalid' elif attribute == 'release'.lower(): csvRowString += 'release' elif attribute == 'release_7digitalid'.lower(): csvRowString += 'release_7digitalid' elif attribute == 'song_hotttnesss'.lower(): csvRowString += 'song_hotttnesss' elif attribute == 'track_7digitalid'.lower(): csvRowString += 'track_7digitalid' elif attribute == 'analysis_sample_rate'.lower(): csvRowString += 'analysis_sample_rate' elif attribute == 'audio_md5'.lower(): csvRowString += 'audio_md5' elif attribute == 'end_of_fade_in'.lower(): csvRowString += 'end_of_fade_in' elif attribute == 'energy'.lower(): csvRowString += 'energy' elif attribute == 'key'.lower(): csvRowString += 'key' elif attribute == 'key_confidence'.lower(): csvRowString += 'key_confidence' elif attribute == 'loudness'.lower(): csvRowString += 'loudness' elif attribute == 'mode'.lower(): csvRowString += 'mode' elif attribute == 'mode_confidence'.lower(): csvRowString += 'mode_confidence' elif attribute == 'start_of_fade_out'.lower(): csvRowString += 'start_of_fade_out' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString); csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = ("SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,"+ "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,"+ "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"+ "Title,Year,track_id,artist_hotttnesss,artist_mbid,artist_playmeid,artist_7digitalid,"+ "release,release_7digitalid,song_hotttnesss,track_7digitalid,analysis_sample_rate,audio_md5,"+ "end_of_fade_in,energy,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out") ################################################# csvAttributeList = re.split('\W+', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() outputFile1.write("SongNumber,"); outputFile1.write(csvRowString + "\n"); csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "/vagrant/genrepython/MillionSongSubset" # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumName = str(hdf5_getters.get_release(songH5File)) song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str(hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignatureConfidence = str(hdf5_getters.get_key_confidence(songH5File)) # song.lyrics = None # song.popularity = None song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str(hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str(hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) song.track_id = str(hdf5_getters.get_track_id(songH5File)) song.artist_familiarity = str(hdf5_getters.get_artist_familiarity(songH5File)) song.artist_hotttnesss = str(hdf5_getters.get_artist_hotttnesss(songH5File)) song.artist_mbid = str(hdf5_getters.get_artist_mbid(songH5File)) song.artist_playmeid = str(hdf5_getters.get_artist_playmeid(songH5File)) song.artist_7digitalid = str(hdf5_getters.get_artist_7digitalid(songH5File)) song.release = str(hdf5_getters.get_release(songH5File)) song.release_7digitalid = str(hdf5_getters.get_release_7digitalid(songH5File)) song.song_hotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File)) song.track_7digitalid = str(hdf5_getters.get_track_7digitalid(songH5File)) song.analysis_sample_rate = str(hdf5_getters.get_analysis_sample_rate(songH5File)) song.audio_md5 = str(hdf5_getters.get_audio_md5(songH5File)) song.end_of_fade_in = str(hdf5_getters.get_end_of_fade_in(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.key = str(hdf5_getters.get_key(songH5File)) song.key_confidence = str(hdf5_getters.get_key_confidence(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.mode_confidence = str(hdf5_getters.get_mode_confidence(songH5File)) song.start_of_fade_out = str(hdf5_getters.get_start_of_fade_out(songH5File)) #print song count csvRowString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace(',',"") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',','') csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): csvRowString += "\"" + song.artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year elif attribute == 'track_id'.lower(): csvRowString += song.track_id elif attribute == 'artist_familiarity'.lower(): csvRowString += song.artist_familiarity elif attribute == 'artist_hotttnesss'.lower(): csvRowString += song.artist_hotttnesss elif attribute == 'artist_mbid'.lower(): csvRowString += song.artist_mbid elif attribute == 'artist_playmeid'.lower(): csvRowString += song.artist_playmeid elif attribute == 'artist_7digitalid'.lower(): csvRowString += song.artist_7digitalid elif attribute == 'release'.lower(): csvRowString += song.release elif attribute == 'release_7digitalid'.lower(): csvRowString += song.release_7digitalid elif attribute == 'song_hotttnesss'.lower(): csvRowString += song.song_hotttnesss elif attribute == 'track_7digitalid'.lower(): csvRowString += song.track_7digitalid elif attribute == 'analysis_sample_rate'.lower(): csvRowString += song.analysis_sample_rate elif attribute == 'audio_md5'.lower(): csvRowString += song.audio_md5 elif attribute == 'end_of_fade_in'.lower(): csvRowString += song.end_of_fade_in elif attribute == 'energy'.lower(): csvRowString += song.energy elif attribute == 'key'.lower(): csvRowString += song.key elif attribute == 'key_confidence'.lower(): csvRowString += song.key_confidence elif attribute == 'loudness'.lower(): csvRowString += song.loudness elif attribute == 'mode'.lower(): csvRowString += song.mode elif attribute == 'mode_confidence'.lower(): csvRowString += song.mode_confidence elif attribute == 'start_of_fade_out'.lower(): csvRowString += song.start_of_fade_out else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" songH5File.close() outputFile1.close()
def main(): basedir = "./../songMetaInfo.txt" ext = ".h5" if len(sys.argv) > 1: basedir = sys.argv[1] outputfile = 'SongFileMetaData.csv' if len(sys.argv) > 2: outputfile = sys.argv[2] csvWriter = open(outputfile, 'w') csvWriter.write( "title,songId,artistId,artistfamilarity,artistHotness,songHotness," + "songEnfOfFadeIn,startFadeout,energy,loudness,albumID,albumName,artistName,danceability,duration,keySignatureConfidence,tempo,timeSignature,timeSignatureConfidence,year\n" ) with open(basedir) as file: for line in file.readlines(): f = line.strip() #newf = f + "text" print f #print f try: songH5File = hdf5_getters.open_h5_file_read(f) csvStr = "" #0 title = str(hdf5_getters.get_title(songH5File)) csvStr += title + "," #1 songId = str(hdf5_getters.get_song_id(songH5File)) csvStr += songId + "," #2 artistId = str(hdf5_getters.get_artist_id(songH5File)) csvStr += artistId + "," #3 artistfamilarity = str( hdf5_getters.get_artist_familiarity(songH5File)) csvStr += artistfamilarity + "," #4 artistHotness = str( hdf5_getters.get_artist_hotttnesss(songH5File)) csvStr += artistHotness + "," #5 songHotness = str(hdf5_getters.get_song_hotttnesss(songH5File)) csvStr += songHotness + "," #6 songEnfOfFadeIn = str( hdf5_getters.get_end_of_fade_in(songH5File)) csvStr += songEnfOfFadeIn + "," #7 startFadeOut = str( hdf5_getters.get_start_of_fade_out(songH5File)) csvStr += startFadeOut + "," #8 energy = str(hdf5_getters.get_energy(songH5File)) csvStr += energy + "," #9 loudness = str(hdf5_getters.get_loudness(songH5File)) csvStr += loudness + "," #10 albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) csvStr += albumID + "," #11 albumName = str(hdf5_getters.get_release(songH5File)) csvStr += albumName + "," #12 artistName = str(hdf5_getters.get_artist_name(songH5File)) csvStr += artistName + "," #13 danceability = str(hdf5_getters.get_danceability(songH5File)) csvStr += danceability + "," #14 duration = str(hdf5_getters.get_duration(songH5File)) csvStr += duration + "," #15 keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) csvStr += keySignatureConfidence + "," #16 tempo = str(hdf5_getters.get_tempo(songH5File)) csvStr += tempo + "," ## 17 timeSignature = str( hdf5_getters.get_time_signature(songH5File)) csvStr += timeSignature + "," #18 timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) csvStr += timeSignatureConfidence + "," #19 year = str(hdf5_getters.get_year(songH5File)) csvStr += year + "," #print song count csvStr += "\n" csvWriter.write(csvStr) #print csvStr songH5File.close() except: print "Error in processing file" csvWriter.close()
def main(): outputFile1 = open('SongCSV.csv', 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input( "\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude," + " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Familiarity'.lower(): ####Added by us! csvRowString += song.familiarity elif attribute == 'artist_mbid'.lower(): csvRowString += song.artist_mbid elif attribute == 'artist_playmeid'.lower(): csvRowString += song.artist_playmeid elif attribute == 'artist_7digid'.lower(): csvRowString += song.artist_7digid elif attribute == 'hottness'.lower(): csvRowString += song.hottness elif attribute == 'song_hottness'.lower(): csvRowString += song.song_hottness elif attribute == 'digitalid7'.lower(): csvRowString += song.digitalid7 elif attribute == 'similar_artists'.lower(): csvRowString += song.similar_artists elif attribute == 'artist_terms'.lower(): csvRowString += song.artist_terms elif attribute == 'art_terms_freq'.lower(): csvRowString += song.art_terms_freq elif attribute == 'art_terms_weight'.lower(): csvRowString += song.art_terms_weight elif attribute == 'a_sample_rate'.lower(): csvRowString += song.a_sample_rate elif attribute == 'audio_md5'.lower(): csvRowString += song.audio_md5 elif attribute == 'end_of_fade_in'.lower(): csvRowString += song.end_of_fade_in elif attribute == 'energy'.lower(): csvRowString += song.energy elif attribute == 'loudness'.lower(): csvRowString += song.loudness elif attribute == 'mode'.lower(): csvRowString += song.mode elif attribute == 'mode_conf'.lower(): csvRowString += song.mode_conf elif attribute == 'start_of_fade_out'.lower(): csvRowString += song.start_of_fade_out elif attribute == 'trackid'.lower(): csvRowString += song.trackid elif attribute == 'segm_start'.lower(): csvRowString += song.segm_start elif attribute == 'segm_conf'.lower(): csvRowString += song.segm_conf elif attribute == 'segm_pitch'.lower(): csvRowString += song.segm_pitch elif attribute == 'segm_timbre'.lower(): csvRowString += song.segm_timbre elif attribute == 'segm_max_loud'.lower(): csvRowString += song.segm_max_loud elif attribute == 'segm_max_loud_time'.lower(): csvRowString += song.segm_max_loud_time elif attribute == 'segm_loud_start'.lower(): csvRowString += song.segm_loud_start elif attribute == 'sect_start'.lower(): csvRowString += song.sect_start elif attribute == 'sect_conf'.lower(): csvRowString += song.sect_conf elif attribute == 'beats_start'.lower(): csvRowString += song.beats_start elif attribute == 'beats_conf'.lower(): csvRowString += song.beats_conf elif attribute == 'bars_start'.lower(): csvRowString += song.bars_start elif attribute == 'bars_conf'.lower(): csvRowString += song.bars_conf elif attribute == 'tatums_start'.lower(): csvRowString += song.tatums_start elif attribute == 'tatums_conf'.lower(): csvRowString += song.tatums_conf elif attribute == 'artist_mbtags'.lower(): csvRowString += song.artist_mbtags elif attribute == 'artist_mbtags_count'.lower(): csvRowString += song.artist_mbtags_count elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print("==============") print("I believe there has been an error with the input.") print("==============") break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = "SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,Title,Year,Familiarity,Artist_Mbid,Artist_PlaymeId,Artist_7didId,Hottness,Song_Hottness,7digitalid,A_Sample_Rate,Audio_Md5,End_Of_Fade_In,Energy,Loudness,Mode,Mode_Conf,Start_Of_Fade_Out,TrackId" ################################################# csvAttributeList = re.split(',', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "/home/bigdata/smalltest/" # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print(f) songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) # testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumName = str(hdf5_getters.get_release(songH5File)) song.artistLatitude = str( hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str( hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str( hdf5_getters.get_artist_longitude(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) # song.lyrics = None # song.popularity = None song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str( hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) #########Added by us! song.familiarity = str( hdf5_getters.get_artist_familiarity(songH5File)) song.artist_mbid = str(hdf5_getters.get_artist_mbid(songH5File)) song.artist_playmeid = str( hdf5_getters.get_artist_playmeid(songH5File)) song.artist_7digid = str( hdf5_getters.get_artist_7digitalid(songH5File)) song.hottness = str(hdf5_getters.get_artist_hotttnesss(songH5File)) song.song_hottness = str( hdf5_getters.get_song_hotttnesss(songH5File)) song.digitalid7 = str( hdf5_getters.get_track_7digitalid(songH5File)) #song.similar_artists = str(hdf5_getters.get_similar_artists(songH5File)) #song.artist_terms = str(hdf5_getters.get_artist_terms(songH5File)) #song.art_terms_freq = str(hdf5_getters.get_artist_terms_freq(songH5File)) #song.art_terms_weight = str(hdf5_getters.get_artist_terms_weight(songH5File)) song.a_sample_rate = str( hdf5_getters.get_analysis_sample_rate(songH5File)) song.audio_md5 = str(hdf5_getters.get_audio_md5(songH5File)) song.end_of_fade_in = str( hdf5_getters.get_end_of_fade_in(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.mode_conf = str(hdf5_getters.get_mode_confidence(songH5File)) song.start_of_fade_out = str( hdf5_getters.get_start_of_fade_out(songH5File)) song.trackid = str(hdf5_getters.get_track_id(songH5File)) #song.segm_start = str(hdf5_getters.get_segments_start(songH5File)) #song.segm_conf = str(hdf5_getters.get_segments_confidence(songH5File)) #song.segm_pitch = str(hdf5_getters.get_segments_pitches(songH5File)) #song.segm_timbre = str(hdf5_getters.get_segments_timbre(songH5File)) #song.segm_max_loud = str(hdf5_getters.get_segments_loudness_max(songH5File)) #song.segm_max_loud_time = str(hdf5_getters.get_segments_loudness_max_time(songH5File)) #song.segm_loud_start = str(hdf5_getters.get_segments_loudness_start(songH5File)) #song.sect_start = str(hdf5_getters.get_sections_start(songH5File)) #song.sect_conf = str(hdf5_getters.get_sections_confidence(songH5File)) #song.beats_start = str(hdf5_getters.get_beats_start(songH5File)) #song.beats_conf = str(hdf5_getters.get_beats_confidence(songH5File)) #song.bars_start = str(hdf5_getters.get_bars_start(songH5File)) #song.bars_conf = str(hdf5_getters.get_bars_confidence(songH5File)) #song.tatums_start = str(hdf5_getters.get_tatums_start(songH5File)) #song.tatums_conf = str(hdf5_getters.get_tatums_confidence(songH5File)) #song.artist_mbtags = str(hdf5_getters.get_artist_mbtags(songH5File)) #song.artist_mbtags_count = str(hdf5_getters.get_artist_mbtags_count(songH5File)) #print song count #csvRowString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace("b\"", "") albumName = albumName.replace("\"", "") albumName = albumName.replace(',', "") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',', '') location = location.replace("b\"", "") location = location.replace("\"", "") csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): artistName = song.artistName artistName = artistName.replace("b\"", "") artistName = artistName.replace("\"", "") csvRowString += "\"" + artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): t = song.title t = t.replace("b\"", "") t = t.replace("\"", "") csvRowString += "\"" + t + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year elif attribute == 'Familiarity'.lower(): ####Added by us! csvRowString += song.familiarity elif attribute == 'artist_mbid'.lower(): csvRowString += "\"" + song.artist_mbid + "\"" elif attribute == 'artist_playmeid'.lower(): csvRowString += song.artist_playmeid elif attribute == 'artist_7digid'.lower(): csvRowString += song.artist_7digid elif attribute == 'hottness'.lower(): csvRowString += song.hottness elif attribute == 'song_hottness'.lower(): csvRowString += song.song_hottness elif attribute == 'digitalid7'.lower(): csvRowString += song.digitalid7 elif attribute == 'similar_artists'.lower(): csvRowString += song.similar_artists elif attribute == 'artist_terms'.lower(): csvRowString += song.artist_terms elif attribute == 'art_terms_freq'.lower(): csvRowString += song.art_terms_freq elif attribute == 'art_terms_weight'.lower(): csvRowString += song.art_terms_weight elif attribute == 'a_sample_rate'.lower(): csvRowString += song.a_sample_rate elif attribute == 'audio_md5'.lower(): csvRowString += "\"" + song.audio_md5 + "\"" elif attribute == 'end_of_fade_in'.lower(): csvRowString += song.end_of_fade_in elif attribute == 'energy'.lower(): csvRowString += song.energy elif attribute == 'loudness'.lower(): csvRowString += song.loudness elif attribute == 'mode'.lower(): csvRowString += song.mode elif attribute == 'mode_conf'.lower(): csvRowString += song.mode_conf elif attribute == 'start_of_fade_out'.lower(): csvRowString += song.start_of_fade_out elif attribute == 'trackid'.lower(): csvRowString += "\"" + song.trackid + "\"" elif attribute == 'segm_start'.lower(): csvRowString += song.segm_start elif attribute == 'segm_conf'.lower(): csvRowString += song.segm_conf elif attribute == 'segm_pitch'.lower(): csvRowString += song.segm_pitch elif attribute == 'segm_timbre'.lower(): csvRowString += song.segm_timbre elif attribute == 'segm_max_loud'.lower(): csvRowString += song.segm_max_loud elif attribute == 'segm_max_loud_time'.lower(): csvRowString += song.segm_max_loud_time elif attribute == 'segm_loud_start'.lower(): csvRowString += song.segm_loud_start elif attribute == 'sect_start'.lower(): csvRowString += song.sect_start elif attribute == 'sect_conf'.lower(): csvRowString += song.sect_conf elif attribute == 'beats_start'.lower(): csvRowString += song.beats_start elif attribute == 'beats_conf'.lower(): csvRowString += song.beats_conf elif attribute == 'bars_start'.lower(): csvRowString += song.bars_start elif attribute == 'bars_conf'.lower(): csvRowString += song.bars_conf elif attribute == 'tatums_start'.lower(): csvRowString += song.tatums_start elif attribute == 'tatums_conf'.lower(): csvRowString += song.tatums_conf elif attribute == 'artist_mbtags'.lower(): csvRowString += song.artist_mbtags elif attribute == 'artist_mbtags_count'.lower(): csvRowString += song.artist_mbtags_count else: csvRowString += "\"ERR\"" csvRowString += "," #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" songH5File.close() outputFile1.close()
def getData(starting_point): starting = starting_point * 10000 files = glob.glob('/mnt/snap/data/*/*/*/*.h5') file_one_round = files[starting:starting + 10000] artist_ids = [] song_beats_persecond = [] song_duration = [] song_end_fade_in = [] song_start_fade_out = [] song_key = [] song_loudness = [] song_segments_loudness_max = [] song_segments_loudness_min = [] song_segments_loudness_med = [] song_segments_loudness_time_max = [] song_segments_loudness_time_min = [] song_segments_loudness_time_med = [] song_mode = [] song_sections_start = [] song_pitches = [] song_timbre = [] song_tempo = [] song_time_signature = [] song_title = [] artist_name = [] year = [] idx = np.triu_indices(12) #count = 1 for f in file_one_round: h5 = HDF5.open_h5_file_read(f) songYear = g.get_year(h5) if songYear < 1990: continue artist_id = g.get_artist_id(h5) song_beat = (g.get_beats_start(h5)).tolist() songDuration = g.get_duration(h5) song_beat_persecond = float(len(song_beat)) / songDuration song_end_fadein = g.get_end_of_fade_in(h5) song_start_fadeout = g.get_start_of_fade_out(h5) songKey = g.get_key(h5) songLoudness = g.get_loudness(h5) song_loudness_max = (g.get_segments_loudness_max(h5)) // 10 song_loudness_antilog = np.power(10, song_loudness_max) song_segmentsLoudness_max = np.amax(song_loudness_antilog) song_segmentsLoudness_min = np.amin(song_loudness_antilog) song_segmentsLoudness_med = np.median(song_loudness_antilog) song_segmentsLoudness_max_time = ( g.get_segments_loudness_max_time(h5)).tolist() song_loudness_time = np.multiply(song_loudness_antilog, song_segmentsLoudness_max_time) song_segmentsLoudnessTime_max = np.amax(song_loudness_time) song_segmentsLoudnessTime_min = np.amin(song_loudness_time) song_segmentsLoudnessTime_med = np.median(song_loudness_time) songMode = g.get_mode(h5) song_sectionsStart = (g.get_sections_start(h5)).tolist() songPitches = g.get_segments_pitches(h5) songPitches_cov = np.cov(songPitches, rowvar=False) songPitches_mean = np.mean(songPitches, axis=0) #print(songPitches_cov.shape) songTimbre = g.get_segments_timbre(h5) songTimbre_cov = np.cov(songTimbre, rowvar=False) songTimbre_mean = np.mean(songTimbre, axis=0) #print(songTimbre_cov.shape) songTempo = g.get_tempo(h5) songTime_signature = g.get_time_signature(h5) songTitle = g.get_title(h5) artistName = g.get_artist_name(h5) artist_ids.append(artist_id) song_beats_persecond.append(song_beat_persecond) song_duration.append(songDuration) song_end_fade_in.append(song_end_fadein) song_start_fade_out.append(song_start_fadeout) song_key.append(songKey) song_loudness.append(songLoudness) song_segments_loudness_max.append(song_segmentsLoudness_max) song_segments_loudness_min.append(song_segmentsLoudness_min) song_segments_loudness_med.append(song_segmentsLoudness_med) song_segments_loudness_time_max.append(song_segmentsLoudnessTime_max) song_segments_loudness_time_min.append(song_segmentsLoudnessTime_min) song_segments_loudness_time_med.append(song_segmentsLoudnessTime_med) song_mode.append(songMode) song_sections_start.append(song_sectionsStart) pitches_mean_cov = (songPitches_cov[idx]).tolist() pitches_mean_cov.extend((songPitches_mean).tolist()) song_pitches.append(pitches_mean_cov) timbre_mean_cov = (songTimbre_cov[idx]).tolist() timbre_mean_cov.extend((songTimbre_mean).tolist()) song_timbre.append(timbre_mean_cov) song_tempo.append(songTempo) song_time_signature.append(songTime_signature) song_title.append(songTitle) artist_name.append(artistName) year.append(songYear) #print(count) #count = count + 1 h5.close() #def createDictsFrom2DArray(dictionary, colName, featureList): # for i in range(0,12): # dictionary[colName+str(i)] = featureList[i] #i = 1 #for t in itertools.izip_longest(*featureList): # dictionary[colName+str(i)] = t # i = i + 1 # return dictionary data = collections.OrderedDict() data['year'] = year data['artist_name'] = artist_name data['artist_id'] = artist_ids data['song_title'] = song_title data['song_beats_persecond'] = song_beats_persecond data['song_duration'] = song_duration data['song_end_fade_in'] = song_end_fade_in data['song_start_fade_out'] = song_start_fade_out data['song_key'] = song_key data['song_loudness'] = song_loudness data['song_loudness_max'] = song_segments_loudness_max data['song_loudness_min'] = song_segments_loudness_min data['song_loudness_med'] = song_segments_loudness_med data['song_loudness_time_max'] = song_segments_loudness_time_max data['song_loudness_time_min'] = song_segments_loudness_time_min data['song_loudness_time_med'] = song_segments_loudness_time_med data['song_mode'] = song_mode data['song_tempo'] = song_tempo data['song_time_signature'] = song_time_signature data = createDictsFrom1DArray(data, 'pitches', song_pitches) data = createDictsFrom1DArray(data, 'timbre', song_timbre) data = createDictsFrom1DArray(data, 'sections_start', song_sections_start) df = pd.DataFrame(data) print('before return ' + str(starting_point)) return df
def convert_to_csv(): i = 0 data = [] target = [] count = 0 with open('data_timbre.csv', 'w+') as f: with open('target_timbre.csv', 'w+') as f2: writer = csv.writer(f) target_writer = csv.writer(f2) for root, dirs, files in os.walk(msd_subset_data_path): files = glob.glob(os.path.join(root, '*.h5')) for f in sorted(files): try: # Opening is very prone to causing exceptions, we'll just skip file if exception is thrown h5 = getter.open_h5_file_read(f) year = getter.get_year(h5) if year: count += 1 analysis_file = open('current_analysis_status.txt', 'a') update = "Currently at file name: " + str( f) + " and at number " + str(count) + "\n" analysis_file.write(update) print update analysis_file.close() target.append([year]) row = [] timbre = getter.get_segments_timbre(h5) segstarts = getter.get_segments_start(h5) btstarts = getter.get_beats_start(h5) duration = getter.get_duration(h5) end_of_fade_in = getter.get_end_of_fade_in(h5) key = getter.get_key(h5) key_confidence = getter.get_key_confidence(h5) loudness = getter.get_loudness(h5) start_of_fade_out = getter.get_start_of_fade_out( h5) tempo = getter.get_tempo(h5) time_signature = getter.get_time_signature(h5) time_signature_confidence = getter.get_time_signature_confidence( h5) h5.close() # VERY IMPORTANT segstarts = np.array(segstarts).flatten() btstarts = np.array(btstarts).flatten() bttimbre = align_feats( timbre.T, segstarts, btstarts, duration, end_of_fade_in, key, key_confidence, loudness, start_of_fade_out, tempo, time_signature, time_signature_confidence) if bttimbre is None: continue # Skip this track, some features broken npicks, winsize, finaldim = 12, 12, 144 # Calculated by 12 * 12. 12 is fixed as number of dimensions. processed_feats = extract_and_compress( bttimbre, npicks, winsize, finaldim) n_p_feats = processed_feats.shape[0] if processed_feats is None: continue # Skip this track, some features broken row = processed_feats.flatten() if len( row ) != 12 * 144: # 12 dimensions * 144 features per dimension continue # Not enough features year_row = np.array([year]) if row.any() and year_row.any(): writer.writerow(row) target_writer.writerow(year_row) i += 1 else: h5.close() except Exception: pass print 'Finished!' analysis_file = open('current_analysis_status.txt', 'a') analysis_file.write('Done!') analysis_file.close() return
def classify(h5): output_array={} # duration duration=hdf5_getters.get_duration(h5) output_array["duration"]=duration ### ADDED VALUE TO ARRAY # number of bars bars=hdf5_getters.get_bars_start(h5) num_bars=len(bars) output_array["num_bars"]=num_bars ### ADDED VALUE TO ARRAY # mean and variance in bar length bar_length=numpy.ediff1d(bars) variance_bar_length=numpy.var(bar_length) output_array["variance_bar_length"]=variance_bar_length ### ADDED VALUE TO ARRAY # number of beats beats=hdf5_getters.get_beats_start(h5) num_beats=len(beats) output_array["num_beats"]=num_beats ### ADDED VALUE TO ARRAY # mean and variance in beats length beats_length=numpy.ediff1d(beats) variance_beats_length=numpy.var(bar_length) output_array["variance_beats_length"]=variance_beats_length ### ADDED VALUE TO ARRAY # danceability danceability=hdf5_getters.get_danceability(h5) output_array["danceability"]=danceability ### ADDED VALUE TO ARRAY # end of fade in end_of_fade_in=hdf5_getters.get_end_of_fade_in(h5) output_array["end_of_fade_in"]=end_of_fade_in ### ADDED VALUE TO ARRAY # energy energy=hdf5_getters.get_energy(h5) output_array["energy"]=energy ### ADDED VALUE TO ARRAY # key key=hdf5_getters.get_key(h5) output_array["key"]=int(key) ### ADDED VALUE TO ARRAY # loudness loudness=hdf5_getters.get_loudness(h5) output_array["loudness"]=loudness ### ADDED VALUE TO ARRAY # mode mode=hdf5_getters.get_mode(h5) output_array["mode"]=int(mode) ### ADDED VALUE TO ARRAY # number sections sections=hdf5_getters.get_sections_start(h5) num_sections=len(sections) output_array["num_sections"]=num_sections ### ADDED VALUE TO ARRAY # mean and variance in sections length sections_length=numpy.ediff1d(sections) variance_sections_length=numpy.var(sections) output_array["variance_sections_length"]=variance_sections_length ### ADDED VALUE TO ARRAY # number segments segments=hdf5_getters.get_segments_start(h5) num_segments=len(segments) output_array["num_segments"]=num_segments ### ADDED VALUE TO ARRAY # mean and variance in segments length segments_length=numpy.ediff1d(segments) variance_segments_length=numpy.var(segments) output_array["variance_segments_length"]=variance_segments_length ### ADDED VALUE TO ARRAY # segment loudness max segment_loudness_max_array=hdf5_getters.get_segments_loudness_max(h5) segment_loudness_max_time_array=hdf5_getters.get_segments_loudness_max_time(h5) segment_loudness_max_index=0 for i in range(len(segment_loudness_max_array)): if segment_loudness_max_array[i]>segment_loudness_max_array[segment_loudness_max_index]: segment_loudness_max_index=i segment_loudness_max=segment_loudness_max_array[segment_loudness_max_index] segment_loudness_max_time=segment_loudness_max_time_array[segment_loudness_max_index] output_array["segment_loudness_max"]=segment_loudness_max ### ADDED VALUE TO ARRAY output_array["segment_loudness_time"]=segment_loudness_max_time ### ADDED VALUE TO ARRAY # POSSIBLE TODO: use average function instead and weight by segment length # segment loudness mean (start) segment_loudness_array=hdf5_getters.get_segments_loudness_start(h5) segment_loudness_mean=numpy.mean(segment_loudness_array) output_array["segment_loudness_mean"]=segment_loudness_mean ### ADDED VALUE TO ARRAY # segment loudness variance (start) segment_loudness_variance=numpy.var(segment_loudness_array) output_array["segment_loudness_variance"]=segment_loudness_variance ### ADDED VALUE TO ARRAY # segment pitches segment_pitches_array=hdf5_getters.get_segments_pitches(h5) segment_pitches_mean=numpy.mean(segment_pitches_array,axis=0).tolist() output_array["segment_pitches_mean"]=segment_pitches_mean # segment pitches variance (start) segment_pitches_variance=numpy.var(segment_pitches_array,axis=0).tolist() output_array["segment_pitches_variance"]=segment_pitches_variance # segment timbres segment_timbres_array=hdf5_getters.get_segments_timbre(h5) segment_timbres_mean=numpy.mean(segment_timbres_array,axis=0).tolist() output_array["segment_timbres_mean"]=segment_timbres_mean # segment timbres variance (start) segment_timbres_variance=numpy.var(segment_timbres_array,axis=0).tolist() output_array["segment_timbres_variance"]=segment_timbres_variance # hotttnesss hottness=hdf5_getters.get_song_hotttnesss(h5,0) output_array["hottness"]=hottness ### ADDED VALUE TO ARRAY # duration-start of fade out start_of_fade_out=hdf5_getters.get_start_of_fade_out(h5) fade_out=duration-start_of_fade_out output_array["fade_out"]=fade_out ### ADDED VALUE TO ARRAY # tatums tatums=hdf5_getters.get_tatums_start(h5) num_tatums=len(tatums) output_array["num_tatums"]=num_tatums ### ADDED VALUE TO ARRAY # mean and variance in tatums length tatums_length=numpy.ediff1d(tatums) variance_tatums_length=numpy.var(tatums_length) output_array["variance_tatums_length"]=variance_tatums_length ### ADDED VALUE TO ARRAY # tempo tempo=hdf5_getters.get_tempo(h5) output_array["tempo"]=tempo ### ADDED VALUE TO ARRAY # time signature time_signature=hdf5_getters.get_time_signature(h5) output_array["time_signature"]=int(time_signature) ### ADDED VALUE TO ARRAY # year year=hdf5_getters.get_year(h5) output_array["year"]=int(year) ### ADDED VALUE TO ARRAY # artist terms artist_terms=hdf5_getters.get_artist_terms(h5,0) output_array["artist_terms"]=artist_terms.tolist() artist_terms_freq=hdf5_getters.get_artist_terms_freq(h5,0) output_array["artist_terms_freq"]=artist_terms_freq.tolist() artist_name=hdf5_getters.get_artist_name(h5,0) output_array["artist_name"]=artist_name artist_id=hdf5_getters.get_artist_id(h5,0) output_array["artist_id"]=artist_id # title title=hdf5_getters.get_title(h5,0) output_array["title"]=title return output_array
def main(): dataset_dir = sys.argv[1] feat =[] feat1=[] feat2=[] feat3=[] feat4=[] print "Forming Dataset..." listing1 = os.listdir(dataset_dir) for a in listing1: listing2 = os.listdir(dataset_dir+a+'/') for b in listing2: listing3 = os.listdir(dataset_dir+a+'/'+b+'/') for c in listing3: listing4 = os.listdir(dataset_dir+a+'/'+b+'/'+c+'/') for d in listing4: h5 = hdf5_getters.open_h5_file_read(dataset_dir+a+'/'+b+'/'+c+'/'+d) feat =[] feat1=[] feat2=[] feat3=[] feat4=[] temp = hdf5_getters.get_artist_hotttnesss(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue temp = hdf5_getters.get_artist_familiarity(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue temp = hdf5_getters.get_end_of_fade_in(h5) if (math.isnan(temp)): h5.close() continue temp = hdf5_getters.get_key_confidence(h5) if (math.isnan(temp)): h5.close() continue temp = hdf5_getters.get_loudness(h5) if (math.isnan(temp)): h5.close() continue temp = hdf5_getters.get_mode_confidence(h5) if (math.isnan(temp)): h5.close() continue temp = hdf5_getters.get_sections_confidence(h5) if temp.size == 0: h5.close() continue temp = hdf5_getters.get_segments_confidence(h5) if temp.size == 0: h5.close() continue temp = hdf5_getters.get_segments_loudness_max(h5) if temp.size == 0: h5.close() continue temp = hdf5_getters.get_segments_loudness_max_time(h5) if temp.size == 0: h5.close() continue temp = hdf5_getters.get_segments_pitches(h5) if temp.size == 0: h5.close() continue temp = hdf5_getters.get_segments_timbre(h5) if temp.size == 0: h5.close() continue temp = hdf5_getters.get_start_of_fade_out(h5) if (math.isnan(temp)): h5.close() continue temp = hdf5_getters.get_tempo(h5) if (math.isnan(temp)): h5.close() continue temp = hdf5_getters.get_time_signature_confidence(h5) if (math.isnan(temp)): h5.close() continue temp = hdf5_getters.get_year(h5) if temp == 0: h5.close() continue temp = hdf5_getters.get_artist_terms(h5) if temp.size == 0: h5.close() continue temp_ = hdf5_getters.get_artist_terms_weight(h5) if temp_.size == 0: continue temp = hdf5_getters.get_bars_confidence(h5) sz = temp.size if sz<50: h5.close() continue temp = hdf5_getters.get_beats_confidence(h5) sz = temp.size if sz <50: h5.close() continue mm = np.mean(temp) vv = np.var(temp) if mm==0.0 and vv==0.0: h5.close() continue temp = hdf5_getters.get_segments_confidence(h5) sz = temp.size if sz <50: h5.close() continue temp = hdf5_getters.get_tatums_confidence(h5) sz = temp.size if sz <50: h5.close() continue temp = hdf5_getters.get_song_hotttnesss(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue temp = hdf5_getters.get_bars_confidence(h5) sz = temp.size sz1 = sz/50 i=1 j=0 while i<=50: if i == 50: sz2 = sz else: sz2 = i*sz1 num=0.0 acc = 0 while j<sz2: acc += temp[j] j+=1 num+=1.0 mm = acc/num feat1.append(mm) i+=1 temp = hdf5_getters.get_beats_confidence(h5) sz = temp.size sz1 = sz/50 i=1 j=0 while i<=50: if i == 50: sz2 = sz else: sz2 = i*sz1 num=0.0 acc = 0 while j<sz2: acc += temp[j] j+=1 num+=1.0 mm = acc/num feat2.append(mm) i+=1 temp = hdf5_getters.get_segments_confidence(h5) sz = temp.size sz1 = sz/50 i=1 j=0 while i<=50: if i == 50: sz2 = sz else: sz2 = i*sz1 num=0.0 acc = 0 while j<sz2: acc += temp[j] j+=1 num+=1.0 mm = acc/num feat3.append(mm) i+=1 temp = hdf5_getters.get_tatums_confidence(h5) sz = temp.size sz1 = sz/50 i=1 j=0 while i<=50: if i == 50: sz2 = sz else: sz2 = i*sz1 num=0.0 acc = 0 while j<sz2: acc += temp[j] j+=1 num+=1.0 mm = acc/num feat4.append(mm) i+=1 i=0 avg = 0.0 while i<50: avg = (feat1[i] + feat2[i] + feat3[i] + feat4[i])/4.0 feat.append(avg) i++ temp = hdf5_getters.get_song_hotttnesss(h5) hott = 0 if temp >=0.75: hott = 1 elif temp >=0.40 and temp <0.75: hott = 2 else: hott = 3 feat.append(hott) h5.close() count = 1 f=open('MSD_DATASET_LSTM.txt', 'a') outstring='' cnt = 0 feat_size = len(feat) for i in feat: cnt+=1 outstring+=str(i) if (cnt!=feat_size): outstring+=',' outstring+='\n' f.write(outstring) f.close()
def data_to_flat_file(basedir,ext='.h5') : """This function extract the information from the tables and creates the flat file.""" count = 0; #song counter list_to_write= [] row_to_write = "" writer = csv.writer(open("metadata.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') comma=title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') #eliminating commas in the album comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,"); if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg= get_avg(bars_c) bars_c_max= get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev= get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max= get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev= get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg= get_avg(beats_c) beats_c_max= get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev= get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max= get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev= get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg= get_avg(sec_c) sec_c_max= get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev= get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max= get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev= get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg= get_avg(seg_c) seg_c_max= get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev= get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg= get_avg(seg_loud_max) seg_loud_max_max= get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev= get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg= get_avg(seg_loud_max_time) seg_loud_max_time_max= get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg= get_avg(seg_loud_start) seg_loud_start_max= get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev= get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg= get_avg(seg_start) seg_start_max= get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev= get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg= get_avg(tatms_c) tatms_c_max= get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev= get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg= get_avg(tatms_start) tatms_start_max= get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev= get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 #genre was found in dictionary if genre_set == 1: col_num=[] for genre in final_genre: column=int(genre) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array else: genre_array=genre_columns(-1) #the genre was not found in the dictionary transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 #Writing to the flat file writer.writerow([title,album,artist_name,duration,samp_rt,artist_7digitalid,artist_fam,artist_hotness,artist_id,artist_lat,artist_loc,artist_lon,artist_mbid,genre_array[0],genre_array[1],genre_array[2], genre_array[3],genre_array[4],genre_array[5],genre_array[6],genre_array[7],genre_array[8],genre_array[9],genre_array[10],genre_array[11],genre_array[12],genre_array[13],genre_array[14],genre_array[15], genre_array[16],genre_array[17],genre_array[18],genre_array[19],genre_array[20],genre_array[21],genre_array[22],genre_array[23],genre_array[24],genre_array[25],genre_array[26], genre_array[27],genre_array[28],genre_array[29],genre_array[30],genre_array[31],genre_array[32],genre_array[33],genre_array[34],genre_array[35],genre_array[36],genre_array[37],genre_array[38], genre_array[39],genre_array[40],genre_array[41],genre_array[42],genre_array[43],genre_array[44],genre_array[45],genre_array[46],genre_array[47],genre_array[48],genre_array[49], genre_array[50],genre_array[51],genre_array[52],genre_array[53],genre_array[54],genre_array[55],genre_array[56],genre_array[57],genre_array[58],genre_array[59], genre_array[60],genre_array[61],genre_array[62],genre_array[63],genre_array[64],genre_array[65],genre_array[66],genre_array[67],genre_array[68],genre_array[69], genre_array[70],genre_array[71],genre_array[72],genre_array[73],genre_array[74],genre_array[75],genre_array[76],genre_array[77],genre_array[78],genre_array[79], genre_array[80],genre_array[81],genre_array[82],genre_array[83],genre_array[84],genre_array[85],genre_array[86],genre_array[87],genre_array[88],genre_array[89], genre_array[90],genre_array[91],genre_array[92],genre_array[93],genre_array[94],genre_array[95],genre_array[96],genre_array[97],genre_array[98],genre_array[99],genre_array[100],genre_array[101], genre_array[102],genre_array[103],genre_array[104],genre_array[105],genre_array[106],genre_array[107],genre_array[108],genre_array[109],genre_array[110],genre_array[111],genre_array[112], genre_array[113],genre_array[114],genre_array[115],genre_array[116],genre_array[117],genre_array[118],genre_array[119],genre_array[120],genre_array[121],genre_array[122],genre_array[123], genre_array[124],genre_array[125],genre_array[126],genre_array[127],genre_array[128],genre_array[129],genre_array[130],genre_array[131],genre_array[132], artist_pmid,audio_md5,danceability,end_fade_in,energy,song_key,key_c,loudness,mode,mode_conf,release_7digitalid,song_hot,song_id,start_fade_out,tempo,time_sig,time_sig_c,track_id,track_7digitalid,year,bars_c_avg,bars_c_max,bars_c_min,bars_c_stddev,bars_c_count,bars_c_sum,bars_start_avg,bars_start_max,bars_start_min,bars_start_stddev,bars_start_count,bars_start_sum,beats_c_avg,beats_c_max,beats_c_min,beats_c_stddev,beats_c_count,beats_c_sum,beats_start_avg,beats_start_max,beats_start_min, beats_start_stddev,beats_start_count,beats_start_sum, sec_c_avg,sec_c_max,sec_c_min,sec_c_stddev,sec_c_count,sec_c_sum,sec_start_avg,sec_start_max,sec_start_min,sec_start_stddev,sec_start_count,sec_start_sum,seg_c_avg,seg_c_max,seg_c_min,seg_c_stddev,seg_c_count,seg_c_sum,seg_loud_max_avg,seg_loud_max_max,seg_loud_max_min,seg_loud_max_stddev,seg_loud_max_count,seg_loud_max_sum,seg_loud_max_time_avg,seg_loud_max_time_max,seg_loud_max_time_min,seg_loud_max_time_stddev,seg_loud_max_time_count,seg_loud_max_time_sum,seg_loud_start_avg,seg_loud_start_max,seg_loud_start_min,seg_loud_start_stddev,seg_loud_start_count,seg_loud_start_sum,seg_pitch_avg[0],seg_pitch_max[0],seg_pitch_min[0],seg_pitch_stddev[0],seg_pitch_count[0],seg_pitch_sum[0],seg_pitch_avg[1],seg_pitch_max[1],seg_pitch_min[1],seg_pitch_stddev[1],seg_pitch_count[1],seg_pitch_sum[1],seg_pitch_avg[2],seg_pitch_max[2],seg_pitch_min[2],seg_pitch_stddev[2],seg_pitch_count[2],seg_pitch_sum[2],seg_pitch_avg[3],seg_pitch_max[3],seg_pitch_min[3],seg_pitch_stddev[3],seg_pitch_count[3],seg_pitch_sum[3],seg_pitch_avg[4],seg_pitch_max[4],seg_pitch_min[4],seg_pitch_stddev[4],seg_pitch_count[4],seg_pitch_sum[4],seg_pitch_avg[5],seg_pitch_max[5],seg_pitch_min[5],seg_pitch_stddev[5],seg_pitch_count[5],seg_pitch_sum[5],seg_pitch_avg[6],seg_pitch_max[6],seg_pitch_min[6],seg_pitch_stddev[6],seg_pitch_count[6],seg_pitch_sum[6],seg_pitch_avg[7],seg_pitch_max[7],seg_pitch_min[7],seg_pitch_stddev[7],seg_pitch_count[7],seg_pitch_sum[7],seg_pitch_avg[8],seg_pitch_max[8],seg_pitch_min[8],seg_pitch_stddev[8],seg_pitch_count[8],seg_pitch_sum[8],seg_pitch_avg[9],seg_pitch_max[9],seg_pitch_min[9],seg_pitch_stddev[9],seg_pitch_count[9],seg_pitch_sum[9],seg_pitch_avg[10],seg_pitch_max[10],seg_pitch_min[10],seg_pitch_stddev[10],seg_pitch_count[10],seg_pitch_sum[10],seg_pitch_avg[11],seg_pitch_max[11],seg_pitch_min[11], seg_pitch_stddev[11],seg_pitch_count[11],seg_pitch_sum[11],seg_start_avg,seg_start_max,seg_start_min,seg_start_stddev, seg_start_count,seg_start_sum,seg_timbre_avg[0],seg_timbre_max[0],seg_timbre_min[0],seg_timbre_stddev[0],seg_timbre_count[0], seg_timbre_sum[0],seg_timbre_avg[1],seg_timbre_max[1],seg_timbre_min[1],seg_timbre_stddev[1],seg_timbre_count[1], seg_timbre_sum[1],seg_timbre_avg[2],seg_timbre_max[2],seg_timbre_min[2],seg_timbre_stddev[2],seg_timbre_count[2], seg_timbre_sum[2],seg_timbre_avg[3],seg_timbre_max[3],seg_timbre_min[3],seg_timbre_stddev[3],seg_timbre_count[3], seg_timbre_sum[3],seg_timbre_avg[4],seg_timbre_max[4],seg_timbre_min[4],seg_timbre_stddev[4],seg_timbre_count[4], seg_timbre_sum[4],seg_timbre_avg[5],seg_timbre_max[5],seg_timbre_min[5],seg_timbre_stddev[5],seg_timbre_count[5], seg_timbre_sum[5],seg_timbre_avg[6],seg_timbre_max[6],seg_timbre_min[6],seg_timbre_stddev[6],seg_timbre_count[6], seg_timbre_sum[6],seg_timbre_avg[7],seg_timbre_max[7],seg_timbre_min[7],seg_timbre_stddev[7],seg_timbre_count[7], seg_timbre_sum[7],seg_timbre_avg[8],seg_timbre_max[8],seg_timbre_min[8],seg_timbre_stddev[8],seg_timbre_count[8], seg_timbre_sum[8],seg_timbre_avg[9],seg_timbre_max[9],seg_timbre_min[9],seg_timbre_stddev[9],seg_timbre_count[9], seg_timbre_sum[9],seg_timbre_avg[10],seg_timbre_max[10],seg_timbre_min[10],seg_timbre_stddev[10],seg_timbre_count[10], seg_timbre_sum[10],seg_timbre_avg[11],seg_timbre_max[11],seg_timbre_min[11],seg_timbre_stddev[11],seg_timbre_count[11], seg_timbre_sum[11],tatms_c_avg,tatms_c_max,tatms_c_min,tatms_c_stddev,tatms_c_count,tatms_c_sum,tatms_start_avg,tatms_start_max,tatms_start_min,tatms_start_stddev,tatms_start_count,tatms_start_sum]) h5.close() count=count+1; print count;
song.artistTermsFreq = remove_trap_characters( str(list(hdf5_getters.get_artist_terms_freq(songH5File)))) song.artistTermsWeight = remove_trap_characters( str(list(hdf5_getters.get_artist_terms_weight(songH5File)))) temp = hdf5_getters.get_artist_mbtags(songH5File) song.artistMBTags = remove_trap_characters(str(list(temp))) song.artistMBTagsOuterCount = get_list_length(temp) song.artistMBTagsCount = remove_trap_characters( str(list(hdf5_getters.get_artist_mbtags_count(songH5File)))) song.analysisSampleRate = remove_trap_characters( str(hdf5_getters.get_analysis_sample_rate(songH5File))) song.audioMD5 = remove_trap_characters( str(hdf5_getters.get_audio_md5(songH5File))) song.endOfFadeIn = remove_trap_characters( str(hdf5_getters.get_end_of_fade_in(songH5File))) song.startOfFadeOut = remove_trap_characters( str(hdf5_getters.get_start_of_fade_out(songH5File))) song.energy = remove_trap_characters( str(hdf5_getters.get_energy(songH5File))) song.release = remove_trap_characters( str(hdf5_getters.get_release(songH5File))) song.release7digitalid = remove_trap_characters( str(hdf5_getters.get_release_7digitalid(songH5File))) song.songHotness = remove_trap_characters( str(hdf5_getters.get_song_hotttnesss(songH5File))) song.track7digitalid = remove_trap_characters( str(hdf5_getters.get_track_7digitalid(songH5File))) temp = hdf5_getters.get_similar_artists(songH5File) song.similarartists = remove_trap_characters(str(list(list(temp))))
def data_to_flat_file(basedir, ext='.h5'): """ This function extracts the information from the tables and creates the flat file. """ count = 0 #song counter list_to_write = [] group_index = 0 row_to_write = "" writer = csv.writer(open("complete.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: row = [] print f h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title = title.replace('"', '') row.append(title) comma = title.find(',') if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album = album.replace('"', '') row.append(album) comma = album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma = artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name = artist_name.replace('"', '') row.append(artist_name) duration = hdf5_getters.get_duration(h5) row.append(duration) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) row.append(samp_rt) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) row.append(artist_7digitalid) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam = -1 row.append(artist_fam) artist_hotness = hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness = -1 row.append(artist_hotness) artist_id = hdf5_getters.get_artist_id(h5) row.append(artist_id) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat = -1 row.append(artist_lat) artist_loc = hdf5_getters.get_artist_location(h5) row.append(artist_loc) artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon = -1 row.append(artist_lon) artist_mbid = hdf5_getters.get_artist_mbid(h5) row.append(artist_mbid) #Getting the genre art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes = get_genre_indexes( trm_freq) #index of the highest freq genre_set = 0 #flag to see if the genre has been set or not final_genre = [] genres_so_far = [] for i in range(len(genre_indexes)): genre_tmp = get_genre( art_trm, genre_indexes[i] ) #genre that corresponds to the highest freq genres_so_far = genre_dict.get_genre_in_dict( genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set = 1 if genre_set == 1: col_num = [] for i in final_genre: column = int(i) #getting the column number of the genre col_num.append(column) genre_array = genre_columns(col_num) #genre array for i in range(len( genre_array)): #appending the genre_array to the row row.append(genre_array[i]) else: genre_array = genre_columns( -1 ) #when there is no genre matched, return an array of [0...0] for i in range(len( genre_array)): #appending the genre_array to the row row.append(genre_array[i]) artist_pmid = hdf5_getters.get_artist_playmeid(h5) row.append(artist_pmid) audio_md5 = hdf5_getters.get_audio_md5(h5) row.append(audio_md5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability = -1 row.append(danceability) end_fade_in = hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in = -1 row.append(end_fade_in) energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy = -1 row.append(energy) song_key = hdf5_getters.get_key(h5) row.append(song_key) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c = -1 row.append(key_c) loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness = -1 row.append(loudness) mode = hdf5_getters.get_mode(h5) row.append(mode) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf = -1 row.append(mode_conf) release_7digitalid = hdf5_getters.get_release_7digitalid(h5) row.append(release_7digitalid) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot = -1 row.append(song_hot) song_id = hdf5_getters.get_song_id(h5) row.append(song_id) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) row.append(start_fade_out) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo = -1 row.append(tempo) time_sig = hdf5_getters.get_time_signature(h5) row.append(time_sig) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c = -1 row.append(time_sig_c) track_id = hdf5_getters.get_track_id(h5) row.append(track_id) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) row.append(track_7digitalid) year = hdf5_getters.get_year(h5) row.append(year) bars_c = hdf5_getters.get_bars_confidence(h5) bars_start = hdf5_getters.get_bars_start(h5) row_bars_padding = padding( 245 ) #this is the array that will be attached at the end of th row #--------------bars---------------" gral_info = [] gral_info = row[:] empty = [] for i, item in enumerate(bars_c): row.append(group_index) row.append(i) row.append(bars_c[i]) bars_c_avg = get_avg(bars_c) row.append(bars_c_avg) bars_c_max = get_max(bars_c) row.append(bars_c_max) bars_c_min = get_min(bars_c) row.append(bars_c_min) bars_c_stddev = get_stddev(bars_c) row.append(bars_c_stddev) bars_c_count = get_count(bars_c) row.append(bars_c_count) bars_c_sum = get_sum(bars_c) row.append(bars_c_sum) row.append(bars_start[i]) bars_start_avg = get_avg(bars_start) row.append(bars_start_avg) bars_start_max = get_max(bars_start) row.append(bars_start_max) bars_start_min = get_min(bars_start) row.append(bars_start_min) bars_start_stddev = get_stddev(bars_start) row.append(bars_start_stddev) bars_start_count = get_count(bars_start) row.append(bars_start_count) bars_start_sum = get_sum(bars_start) row.append(bars_start_sum) for i in row_bars_padding: row.append(i) writer.writerow(row) row = [] row = gral_info[:] #--------beats---------------" beats_c = hdf5_getters.get_beats_confidence(h5) group_index = 1 row = [] row = gral_info[:] row_front = padding( 14) #blanks left in front of the row(empty spaces for bars) row_beats_padding = padding(231) for i, item in enumerate(beats_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the beats row.append(index) row.append(beats_c[i]) beats_c_avg = get_avg(beats_c) row.append(beats_c_avg) beats_c_max = get_max(beats_c) row.append(beats_c_max) beats_c_min = get_min(beats_c) row.append(beats_c_min) beats_c_stddev = get_stddev(beats_c) row.append(beats_c_stddev) beats_c_count = get_count(beats_c) row.append(beats_c_count) beats_c_sum = get_sum(beats_c) row.append(beats_c_sum) beats_start = hdf5_getters.get_beats_start(h5) row.append(beats_start[i]) beats_start_avg = get_avg(beats_start) row.append(beats_start_avg) beats_start_max = get_max(beats_start) row.append(beats_start_max) beats_start_min = get_min(beats_start) row.append(beats_start_min) beats_start_stddev = get_stddev(beats_start) row.append(beats_start_stddev) beats_start_count = get_count(beats_start) row.append(beats_start_count) beats_start_sum = get_sum(beats_start) row.append(beats_start_sum) for i in row_beats_padding: row.append(i) writer.writerow(row) row = [] row = gral_info[:] # "--------sections---------------" row_sec_padding = padding( 217) #blank spaces left at the end of the row sec_c = hdf5_getters.get_sections_confidence(h5) group_index = 2 row = [] row = gral_info[:] row_front = padding( 28) #blank spaces left in front(empty spaces for bars,beats) for i, item in enumerate(sec_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the sections row.append(index) row.append(sec_c[i]) sec_c_avg = get_avg(sec_c) row.append(sec_c_avg) sec_c_max = get_max(sec_c) row.append(sec_c_max) sec_c_min = get_min(sec_c) row.append(sec_c_min) sec_c_stddev = get_stddev(sec_c) row.append(sec_c_stddev) sec_c_count = get_count(sec_c) row.append(sec_c_count) sec_c_sum = get_sum(sec_c) row.append(sec_c_sum) sec_start = hdf5_getters.get_sections_start(h5) row.append(sec_start[i]) sec_start_avg = get_avg(sec_start) row.append(sec_start_avg) sec_start_max = get_max(sec_start) row.append(sec_start_max) sec_start_min = get_min(sec_start) row.append(sec_start_min) sec_start_stddev = get_stddev(sec_start) row.append(sec_start_stddev) sec_start_count = get_count(sec_start) row.append(sec_start_count) sec_start_sum = get_sum(sec_start) row.append(sec_start_sum) for i in row_sec_padding: #appending the blank spaces at the end of the row row.append(i) writer.writerow(row) row = [] row = gral_info[:] #--------segments---------------" row_seg_padding = padding(182) #blank spaces at the end of the row row_front = padding(42) #blank spaces left in front of segments seg_c = hdf5_getters.get_segments_confidence(h5) group_index = 3 row = [] row = gral_info[:] for i, item in enumerate(seg_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the segments row.append(index) row.append(seg_c[i]) seg_c_avg = get_avg(seg_c) row.append(seg_c_avg) seg_c_max = get_max(seg_c) row.append(seg_c_max) seg_c_min = get_min(seg_c) row.append(seg_c_min) seg_c_stddev = get_stddev(seg_c) row.append(seg_c_stddev) seg_c_count = get_count(seg_c) row.append(seg_c_count) seg_c_sum = get_sum(seg_c) row.append(seg_c_sum) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) row.append(seg_loud_max[i]) seg_loud_max_avg = get_avg(seg_loud_max) row.append(seg_loud_max_avg) seg_loud_max_max = get_max(seg_loud_max) row.append(seg_loud_max_max) seg_loud_max_min = get_min(seg_loud_max) row.append(seg_loud_max_min) seg_loud_max_stddev = get_stddev(seg_loud_max) row.append(seg_loud_max_stddev) seg_loud_max_count = get_count(seg_loud_max) row.append(seg_loud_max_count) seg_loud_max_sum = get_sum(seg_loud_max) row.append(seg_loud_max_sum) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time( h5) row.append(seg_loud_max_time[i]) seg_loud_max_time_avg = get_avg(seg_loud_max_time) row.append(seg_loud_max_time_avg) seg_loud_max_time_max = get_max(seg_loud_max_time) row.append(seg_loud_max_time_max) seg_loud_max_time_min = get_min(seg_loud_max_time) row.append(seg_loud_max_time_min) seg_loud_max_time_stddev = get_stddev(seg_loud_max_time) row.append(seg_loud_max_time_stddev) seg_loud_max_time_count = get_count(seg_loud_max_time) row.append(seg_loud_max_time_count) seg_loud_max_time_sum = get_sum(seg_loud_max_time) row.append(seg_loud_max_time_sum) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) row.append(seg_loud_start[i]) seg_loud_start_avg = get_avg(seg_loud_start) row.append(seg_loud_start_avg) seg_loud_start_max = get_max(seg_loud_start) row.append(seg_loud_start_max) seg_loud_start_min = get_min(seg_loud_start) row.append(seg_loud_start_min) seg_loud_start_stddev = get_stddev(seg_loud_start) row.append(seg_loud_start_stddev) seg_loud_start_count = get_count(seg_loud_start) row.append(seg_loud_start_count) seg_loud_start_sum = get_sum(seg_loud_start) row.append(seg_loud_start_sum) seg_start = hdf5_getters.get_segments_start(h5) row.append(seg_start[i]) seg_start_avg = get_avg(seg_start) row.append(seg_start_avg) seg_start_max = get_max(seg_start) row.append(seg_start_max) seg_start_min = get_min(seg_start) row.append(seg_start_min) seg_start_stddev = get_stddev(seg_start) row.append(seg_start_stddev) seg_start_count = get_count(seg_start) row.append(seg_start_count) seg_start_sum = get_sum(seg_start) row.append(seg_start_sum) for i in row_seg_padding: #appending blank spaces at the end of the row row.append(i) writer.writerow(row) row = [] row = gral_info[:] #----------segments pitch and timbre---------------" row_seg2_padding = padding( 14) #blank spaces left at the end of the row row_front = padding( 77) #blank spaces left at the front of the segments and timbre seg_pitch = hdf5_getters.get_segments_pitches(h5) transpose_pitch = seg_pitch.transpose( ) #this is to tranpose the matrix,so we can have 12 rows group_index = 4 row = [] row = gral_info[:] for i, item in enumerate(transpose_pitch[0]): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of segments and timbre row.append(index) row.append(transpose_pitch[0][i]) seg_pitch_avg = get_avg(transpose_pitch[0]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[0]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[0]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[0]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[0]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[0]) row.append(seg_pitch_sum) row.append(transpose_pitch[1][i]) seg_pitch_avg = get_avg(transpose_pitch[1]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[1]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[1]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[1]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[1]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[1]) row.append(seg_pitch_sum) row.append(transpose_pitch[2][i]) seg_pitch_avg = get_avg(transpose_pitch[2]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[2]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[2]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[2]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[2]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[2]) row.append(seg_pitch_sum) row.append(transpose_pitch[3][i]) seg_pitch_avg = get_avg(transpose_pitch[3]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[3]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[3]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[3]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[3]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[3]) row.append(seg_pitch_sum) row.append(transpose_pitch[4][i]) seg_pitch_avg = get_avg(transpose_pitch[4]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[4]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[4]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[4]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[4]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[4]) row.append(seg_pitch_sum) row.append(transpose_pitch[5][i]) seg_pitch_avg = get_avg(transpose_pitch[5]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[5]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[5]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[5]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[5]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[5]) row.append(seg_pitch_sum) row.append(transpose_pitch[6][i]) seg_pitch_avg = get_avg(transpose_pitch[6]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[6]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[6]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[6]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[6]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[6]) row.append(seg_pitch_sum) row.append(transpose_pitch[7][i]) seg_pitch_avg = get_avg(transpose_pitch[7]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[7]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[7]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[7]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[7]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[7]) row.append(seg_pitch_sum) row.append(transpose_pitch[8][i]) seg_pitch_avg = get_avg(transpose_pitch[8]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[8]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[8]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[8]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[8]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[8]) row.append(seg_pitch_sum) row.append(transpose_pitch[9][i]) seg_pitch_avg = get_avg(transpose_pitch[9]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[9]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[9]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[9]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[9]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[9]) row.append(seg_pitch_sum) row.append(transpose_pitch[10][i]) seg_pitch_avg = get_avg(transpose_pitch[10]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[10]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[10]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[10]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[10]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[10]) row.append(seg_pitch_sum) row.append(transpose_pitch[11][i]) seg_pitch_avg = get_avg(transpose_pitch[11]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[11]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[11]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[11]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[11]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[11]) row.append(seg_pitch_sum) #timbre arrays seg_timbre = hdf5_getters.get_segments_timbre(h5) transpose_timbre = seg_pitch.transpose( ) #tranposing matrix, to have 12 rows row.append(transpose_timbre[0][i]) seg_timbre_avg = get_avg(transpose_timbre[0]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[0]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[0]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[0]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[0]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[0]) row.append(seg_timbre_sum) row.append(transpose_timbre[1][i]) seg_timbre_avg = get_avg(transpose_timbre[1]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[1]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[1]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[1]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[1]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[1]) row.append(seg_timbre_sum) row.append(transpose_timbre[2][i]) seg_timbre_avg = get_avg(transpose_timbre[2]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[2]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[2]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[2]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[2]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[2]) row.append(seg_timbre_sum) row.append(transpose_timbre[3][i]) seg_timbre_avg = get_avg(transpose_timbre[3]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[3]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[3]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[3]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[3]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[3]) row.append(seg_timbre_sum) row.append(transpose_timbre[4][i]) seg_timbre_avg = get_avg(transpose_timbre[4]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[4]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[4]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[4]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[4]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[4]) row.append(seg_timbre_sum) row.append(transpose_timbre[5][i]) seg_timbre_avg = get_avg(transpose_timbre[5]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[5]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[5]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[5]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[5]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[5]) row.append(seg_timbre_sum) row.append(transpose_timbre[6][i]) seg_timbre_avg = get_avg(transpose_timbre[6]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[6]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[6]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[6]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[6]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[6]) row.append(seg_timbre_sum) row.append(transpose_timbre[7][i]) seg_timbre_avg = get_avg(transpose_timbre[7]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[7]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[7]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[7]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[7]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[7]) row.append(seg_timbre_sum) row.append(transpose_timbre[8][i]) seg_timbre_avg = get_avg(transpose_timbre[8]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[8]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[8]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[8]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[8]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[8]) row.append(seg_timbre_sum) row.append(transpose_timbre[9][i]) seg_timbre_avg = get_avg(transpose_timbre[9]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[9]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[9]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[9]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[9]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[9]) row.append(seg_timbre_sum) row.append(transpose_timbre[10][i]) seg_timbre_avg = get_avg(transpose_timbre[10]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[10]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[10]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[10]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[10]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[10]) row.append(seg_timbre_sum) row.append(transpose_timbre[11][i]) seg_timbre_avg = get_avg(transpose_timbre[11]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[11]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[11]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[11]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[11]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[11]) row.append(seg_timbre_sum) for item in row_seg2_padding: row.append(item) writer.writerow(row) row = [] row = gral_info[:] # "--------tatums---------------" tatms_c = hdf5_getters.get_tatums_confidence(h5) group_index = 5 row_front = padding(245) #blank spaces left in front of tatums row = [] row = gral_info[:] for i, item in enumerate(tatms_c): row.append(group_index) row.append(i) for item in row_front: #appending blank spaces at the front of the row row.append(item) row.append(tatms_c[i]) tatms_c_avg = get_avg(tatms_c) row.append(tatms_c_avg) tatms_c_max = get_max(tatms_c) row.append(tatms_c_max) tatms_c_min = get_min(tatms_c) row.append(tatms_c_min) tatms_c_stddev = get_stddev(tatms_c) row.append(tatms_c_stddev) tatms_c_count = get_count(tatms_c) row.append(tatms_c_count) tatms_c_sum = get_sum(tatms_c) row.append(tatms_c_sum) tatms_start = hdf5_getters.get_tatums_start(h5) row.append(tatms_start[i]) tatms_start_avg = get_avg(tatms_start) row.append(tatms_start_avg) tatms_start_max = get_max(tatms_start) row.append(tatms_start_max) tatms_start_min = get_min(tatms_start) row.append(tatms_start_min) tatms_start_stddev = get_stddev(tatms_start) row.append(tatms_start_stddev) tatms_start_count = get_count(tatms_start) row.append(tatms_start_count) tatms_start_sum = get_sum(tatms_start) row.append(tatms_start_sum) writer.writerow(row) row = [] row = gral_info[:] transpose_pitch = seg_pitch.transpose( ) #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg = [] seg_pitch_max = [] seg_pitch_min = [] seg_pitch_stddev = [] seg_pitch_count = [] seg_pitch_sum = [] i = 0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i = i + 1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose( ) #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg = [] seg_timbre_max = [] seg_timbre_min = [] seg_timbre_stddev = [] seg_timbre_count = [] seg_timbre_sum = [] i = 0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i = i + 1 h5.close() count = count + 1 print count
def get_all_rows(basedir, ext='.h5'): rows = [] for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: # print(os.path.join(root, f)) h5 = hdf5_getters.open_h5_file_read(f) num_songs = hdf5_getters.get_num_songs(h5) # print(num_songs) for i in range(num_songs): print(i) obj = {} obj['artist_name'] = hdf5_getters.get_artist_name( h5, i).decode('UTF-8') obj['artist_familiarity'] = hdf5_getters.get_artist_familiarity( h5, i) obj['artist_hotness'] = hdf5_getters.get_artist_hotttnesss( h5, i) obj['artist_id'] = hdf5_getters.get_artist_id( h5, i).decode('UTF-8') # obj['artist_mbid']=hdf5_getters.get_artist_mbid(h5,i).decode('UTF-8') obj['artist_playmeid'] = hdf5_getters.get_artist_playmeid( h5, i) obj['artist_7digitalid'] = hdf5_getters.get_artist_7digitalid( h5, i) # obj['artist_latitude']=hdf5_getters.get_artist_latitude(h5,i) # obj['artist_longitude']=hdf5_getters.get_artist_longitude(h5,i) # obj['artist_location']=hdf5_getters.get_artist_location(h5,i).decode('UTF-8') obj['artist_name'] = hdf5_getters.get_artist_name( h5, i).decode('UTF-8') obj['release'] = hdf5_getters.get_release(h5, i).decode('UTF-8') obj['song_hotttnesss'] = hdf5_getters.get_song_hotttnesss( h5, i) obj['title'] = hdf5_getters.get_title(h5, i).decode('UTF-8') # obj['artist_terms']=hdf5_getters.get_artist_terms(h5) # obj['artist_terms_freq']=hdf5_getters.get_artist_terms_freq(h5) # obj['artist_terms_weight']=hdf5_getters.get_artist_terms_weight(h5) # obj['audio_md5']=hdf5_getters.get_audio_md5(h5).decode('UTF-8') obj['danceability'] = hdf5_getters.get_danceability(h5, i) obj['duration'] = hdf5_getters.get_duration(h5, i) obj['end_of_fade_in'] = hdf5_getters.get_end_of_fade_in(h5, i) obj['energy'] = hdf5_getters.get_energy(h5, i) obj['key'] = hdf5_getters.get_key(h5, i) obj['key_confidence'] = hdf5_getters.get_key_confidence(h5, i) obj['loudness'] = hdf5_getters.get_loudness(h5, i) obj['mode'] = hdf5_getters.get_mode(h5, i) # obj['start_of_fade_out']=hdf5_getters.get_start_of_fade_out(h5) obj['tempo'] = hdf5_getters.get_tempo(h5, i) obj['time_signature'] = hdf5_getters.get_time_signature(h5, i) # obj['time_signature_confidence']=hdf5_getters.get_time_signature_confidence(h5) obj['track_id'] = hdf5_getters.get_track_id(h5, i).decode('UTF-8') # obj['segments_start']=hdf5_getters.get_segments_start(h5) # obj['segments_confidence']=hdf5_getters.get_segments_confidence(h5) # obj['segments_pitches']=hdf5_getters.get_segments_pitches(h5) # obj['segments_timbre']=hdf5_getters.get_segments_timbre(h5) # obj['segments_loudness_max']=hdf5_getters.get_segments_loudness_max(h5) # obj['segments_loudness_max_time']=hdf5_getters.get_segments_loudness_max_time(h5) # obj['segments_confidence']=hdf5_getters.get_segments_confidence(h5) # obj['segments_loudness_start']=hdf5_getters.get_segments_loudness_start(h5) # obj['sections_start']=hdf5_getters.get_sections_start(h5) # obj['sections_confidence']=hdf5_getters.get_sections_confidence(h5) # obj['beats_start']=hdf5_getters.get_beats_start(h5) # obj['beats_confidence']=hdf5_getters.get_beats_confidence(h5) # obj['bars_start']=hdf5_getters.get_bars_start(h5) # obj['bars_confidence']=hdf5_getters.get_bars_confidence(h5) # obj['tatums_start']=hdf5_getters.get_tatums_start(h5) # obj['artist_mbtags']=hdf5_getters.get_artist_mbtags(h5) # obj['artist_mbtags_count']=hdf5_getters.get_artist_mbtags_count(h5) obj['year'] = hdf5_getters.get_year(h5, i) rows.append(obj) h5.close() return rows
def data_to_flat_file(basedir,ext='.h5') : """ This function extracts the information from the tables and creates the flat file. """ count = 0; #song counter list_to_write= [] group_index=0 row_to_write = "" writer = csv.writer(open("complete.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: row=[] print f h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') row.append(title) comma=title.find(',') if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') row.append(album) comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') row.append(artist_name) duration = hdf5_getters.get_duration(h5) row.append(duration) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) row.append(samp_rt) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) row.append(artist_7digitalid) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 row.append(artist_fam) artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 row.append(artist_hotness) artist_id = hdf5_getters.get_artist_id(h5) row.append(artist_id) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 row.append(artist_lat) artist_loc = hdf5_getters.get_artist_location(h5) row.append(artist_loc) artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 row.append(artist_lon) artist_mbid = hdf5_getters.get_artist_mbid(h5) row.append(artist_mbid) #Getting the genre art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq genre_set=0 #flag to see if the genre has been set or not final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 if genre_set == 1: col_num=[] for i in final_genre: column=int(i) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) else: genre_array=genre_columns(-1) #when there is no genre matched, return an array of [0...0] for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) artist_pmid = hdf5_getters.get_artist_playmeid(h5) row.append(artist_pmid) audio_md5 = hdf5_getters.get_audio_md5(h5) row.append(audio_md5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 row.append(danceability) end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 row.append(end_fade_in) energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 row.append(energy) song_key = hdf5_getters.get_key(h5) row.append(song_key) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 row.append(key_c) loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 row.append(loudness) mode = hdf5_getters.get_mode(h5) row.append(mode) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 row.append(mode_conf) release_7digitalid = hdf5_getters.get_release_7digitalid(h5) row.append(release_7digitalid) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 row.append(song_hot) song_id = hdf5_getters.get_song_id(h5) row.append(song_id) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) row.append(start_fade_out) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 row.append(tempo) time_sig = hdf5_getters.get_time_signature(h5) row.append(time_sig) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 row.append(time_sig_c) track_id = hdf5_getters.get_track_id(h5) row.append(track_id) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) row.append(track_7digitalid) year = hdf5_getters.get_year(h5) row.append(year) bars_c = hdf5_getters.get_bars_confidence(h5) bars_start = hdf5_getters.get_bars_start(h5) row_bars_padding=padding(245) #this is the array that will be attached at the end of th row #--------------bars---------------" gral_info=[] gral_info=row[:] empty=[] for i,item in enumerate(bars_c): row.append(group_index) row.append(i) row.append(bars_c[i]) bars_c_avg= get_avg(bars_c) row.append(bars_c_avg) bars_c_max= get_max(bars_c) row.append(bars_c_max) bars_c_min = get_min(bars_c) row.append(bars_c_min) bars_c_stddev= get_stddev(bars_c) row.append(bars_c_stddev) bars_c_count = get_count(bars_c) row.append(bars_c_count) bars_c_sum = get_sum(bars_c) row.append(bars_c_sum) row.append(bars_start[i]) bars_start_avg = get_avg(bars_start) row.append(bars_start_avg) bars_start_max= get_max(bars_start) row.append(bars_start_max) bars_start_min = get_min(bars_start) row.append(bars_start_min) bars_start_stddev= get_stddev(bars_start) row.append(bars_start_stddev) bars_start_count = get_count(bars_start) row.append(bars_start_count) bars_start_sum = get_sum(bars_start) row.append(bars_start_sum) for i in row_bars_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------beats---------------" beats_c = hdf5_getters.get_beats_confidence(h5) group_index=1 row=[] row=gral_info[:] row_front=padding(14) #blanks left in front of the row(empty spaces for bars) row_beats_padding=padding(231) for i,item in enumerate(beats_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the beats row.append(index) row.append(beats_c[i]) beats_c_avg= get_avg(beats_c) row.append(beats_c_avg) beats_c_max= get_max(beats_c) row.append(beats_c_max) beats_c_min = get_min(beats_c) row.append(beats_c_min) beats_c_stddev= get_stddev(beats_c) row.append(beats_c_stddev) beats_c_count = get_count(beats_c) row.append(beats_c_count) beats_c_sum = get_sum(beats_c) row.append(beats_c_sum) beats_start = hdf5_getters.get_beats_start(h5) row.append(beats_start[i]) beats_start_avg = get_avg(beats_start) row.append(beats_start_avg) beats_start_max= get_max(beats_start) row.append(beats_start_max) beats_start_min = get_min(beats_start) row.append(beats_start_min) beats_start_stddev= get_stddev(beats_start) row.append(beats_start_stddev) beats_start_count = get_count(beats_start) row.append(beats_start_count) beats_start_sum = get_sum(beats_start) row.append(beats_start_sum) for i in row_beats_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] # "--------sections---------------" row_sec_padding=padding(217) #blank spaces left at the end of the row sec_c = hdf5_getters.get_sections_confidence(h5) group_index=2 row=[] row=gral_info[:] row_front=padding(28) #blank spaces left in front(empty spaces for bars,beats) for i,item in enumerate(sec_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the sections row.append(index) row.append(sec_c[i]) sec_c_avg= get_avg(sec_c) row.append(sec_c_avg) sec_c_max= get_max(sec_c) row.append(sec_c_max) sec_c_min = get_min(sec_c) row.append(sec_c_min) sec_c_stddev= get_stddev(sec_c) row.append(sec_c_stddev) sec_c_count = get_count(sec_c) row.append(sec_c_count) sec_c_sum = get_sum(sec_c) row.append(sec_c_sum) sec_start = hdf5_getters.get_sections_start(h5) row.append(sec_start[i]) sec_start_avg = get_avg(sec_start) row.append(sec_start_avg) sec_start_max= get_max(sec_start) row.append(sec_start_max) sec_start_min = get_min(sec_start) row.append(sec_start_min) sec_start_stddev= get_stddev(sec_start) row.append(sec_start_stddev) sec_start_count = get_count(sec_start) row.append(sec_start_count) sec_start_sum = get_sum(sec_start) row.append(sec_start_sum) for i in row_sec_padding: #appending the blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------segments---------------" row_seg_padding=padding(182) #blank spaces at the end of the row row_front=padding(42) #blank spaces left in front of segments seg_c = hdf5_getters.get_segments_confidence(h5) group_index=3 row=[] row=gral_info[:] for i,item in enumerate(seg_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the segments row.append(index) row.append(seg_c[i]) seg_c_avg= get_avg(seg_c) row.append(seg_c_avg) seg_c_max= get_max(seg_c) row.append(seg_c_max) seg_c_min = get_min(seg_c) row.append(seg_c_min) seg_c_stddev= get_stddev(seg_c) row.append(seg_c_stddev) seg_c_count = get_count(seg_c) row.append(seg_c_count) seg_c_sum = get_sum(seg_c) row.append(seg_c_sum) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) row.append(seg_loud_max[i]) seg_loud_max_avg= get_avg(seg_loud_max) row.append(seg_loud_max_avg) seg_loud_max_max= get_max(seg_loud_max) row.append(seg_loud_max_max) seg_loud_max_min = get_min(seg_loud_max) row.append(seg_loud_max_min) seg_loud_max_stddev= get_stddev(seg_loud_max) row.append(seg_loud_max_stddev) seg_loud_max_count = get_count(seg_loud_max) row.append(seg_loud_max_count) seg_loud_max_sum = get_sum(seg_loud_max) row.append(seg_loud_max_sum) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) row.append(seg_loud_max_time[i]) seg_loud_max_time_avg= get_avg(seg_loud_max_time) row.append(seg_loud_max_time_avg) seg_loud_max_time_max= get_max(seg_loud_max_time) row.append(seg_loud_max_time_max) seg_loud_max_time_min = get_min(seg_loud_max_time) row.append(seg_loud_max_time_min) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) row.append(seg_loud_max_time_stddev) seg_loud_max_time_count = get_count(seg_loud_max_time) row.append(seg_loud_max_time_count) seg_loud_max_time_sum = get_sum(seg_loud_max_time) row.append(seg_loud_max_time_sum) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) row.append(seg_loud_start[i]) seg_loud_start_avg= get_avg(seg_loud_start) row.append(seg_loud_start_avg) seg_loud_start_max= get_max(seg_loud_start) row.append(seg_loud_start_max) seg_loud_start_min = get_min(seg_loud_start) row.append(seg_loud_start_min) seg_loud_start_stddev= get_stddev(seg_loud_start) row.append(seg_loud_start_stddev) seg_loud_start_count = get_count(seg_loud_start) row.append(seg_loud_start_count) seg_loud_start_sum = get_sum(seg_loud_start) row.append(seg_loud_start_sum) seg_start = hdf5_getters.get_segments_start(h5) row.append(seg_start[i]) seg_start_avg= get_avg(seg_start) row.append(seg_start_avg) seg_start_max= get_max(seg_start) row.append(seg_start_max) seg_start_min = get_min(seg_start) row.append(seg_start_min) seg_start_stddev= get_stddev(seg_start) row.append(seg_start_stddev) seg_start_count = get_count(seg_start) row.append(seg_start_count) seg_start_sum = get_sum(seg_start) row.append(seg_start_sum) for i in row_seg_padding: #appending blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #----------segments pitch and timbre---------------" row_seg2_padding=padding(14) #blank spaces left at the end of the row row_front=padding(77) #blank spaces left at the front of the segments and timbre seg_pitch = hdf5_getters.get_segments_pitches(h5) transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows group_index=4 row=[] row=gral_info[:] for i,item in enumerate(transpose_pitch[0]): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of segments and timbre row.append(index) row.append(transpose_pitch[0][i]) seg_pitch_avg= get_avg(transpose_pitch[0]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[0]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[0]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[0]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[0]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[0]) row.append(seg_pitch_sum) row.append(transpose_pitch[1][i]) seg_pitch_avg= get_avg(transpose_pitch[1]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[1]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[1]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[1]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[1]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[1]) row.append(seg_pitch_sum) row.append(transpose_pitch[2][i]) seg_pitch_avg= get_avg(transpose_pitch[2]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[2]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[2]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[2]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[2]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[2]) row.append(seg_pitch_sum) row.append(transpose_pitch[3][i]) seg_pitch_avg= get_avg(transpose_pitch[3]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[3]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[3]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[3]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[3]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[3]) row.append(seg_pitch_sum) row.append(transpose_pitch[4][i]) seg_pitch_avg= get_avg(transpose_pitch[4]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[4]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[4]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[4]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[4]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[4]) row.append(seg_pitch_sum) row.append(transpose_pitch[5][i]) seg_pitch_avg= get_avg(transpose_pitch[5]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[5]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[5]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[5]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[5]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[5]) row.append(seg_pitch_sum) row.append(transpose_pitch[6][i]) seg_pitch_avg= get_avg(transpose_pitch[6]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[6]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[6]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[6]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[6]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[6]) row.append(seg_pitch_sum) row.append(transpose_pitch[7][i]) seg_pitch_avg= get_avg(transpose_pitch[7]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[7]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[7]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[7]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[7]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[7]) row.append(seg_pitch_sum) row.append(transpose_pitch[8][i]) seg_pitch_avg= get_avg(transpose_pitch[8]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[8]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[8]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[8]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[8]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[8]) row.append(seg_pitch_sum) row.append(transpose_pitch[9][i]) seg_pitch_avg= get_avg(transpose_pitch[9]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[9]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[9]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[9]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[9]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[9]) row.append(seg_pitch_sum) row.append(transpose_pitch[10][i]) seg_pitch_avg= get_avg(transpose_pitch[10]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[10]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[10]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[10]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[10]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[10]) row.append(seg_pitch_sum) row.append(transpose_pitch[11][i]) seg_pitch_avg= get_avg(transpose_pitch[11]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[11]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[11]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[11]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[11]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[11]) row.append(seg_pitch_sum) #timbre arrays seg_timbre = hdf5_getters.get_segments_timbre(h5) transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows row.append(transpose_timbre[0][i]) seg_timbre_avg= get_avg(transpose_timbre[0]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[0]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[0]) row.append(seg_timbre_min) seg_timbre_stddev=get_stddev(transpose_timbre[0]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[0]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[0]) row.append(seg_timbre_sum) row.append(transpose_timbre[1][i]) seg_timbre_avg= get_avg(transpose_timbre[1]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[1]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[1]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[1]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[1]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[1]) row.append(seg_timbre_sum) row.append(transpose_timbre[2][i]) seg_timbre_avg= get_avg(transpose_timbre[2]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[2]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[2]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[2]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[2]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[2]) row.append(seg_timbre_sum) row.append(transpose_timbre[3][i]) seg_timbre_avg= get_avg(transpose_timbre[3]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[3]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[3]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[3]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[3]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[3]) row.append(seg_timbre_sum) row.append(transpose_timbre[4][i]) seg_timbre_avg= get_avg(transpose_timbre[4]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[4]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[4]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[4]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[4]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[4]) row.append(seg_timbre_sum) row.append(transpose_timbre[5][i]) seg_timbre_avg= get_avg(transpose_timbre[5]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[5]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[5]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[5]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[5]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[5]) row.append(seg_timbre_sum) row.append(transpose_timbre[6][i]) seg_timbre_avg= get_avg(transpose_timbre[6]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[6]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[6]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[6]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[6]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[6]) row.append(seg_timbre_sum) row.append(transpose_timbre[7][i]) seg_timbre_avg= get_avg(transpose_timbre[7]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[7]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[7]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[7]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[7]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[7]) row.append(seg_timbre_sum) row.append(transpose_timbre[8][i]) seg_timbre_avg= get_avg(transpose_timbre[8]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[8]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[8]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[8]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[8]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[8]) row.append(seg_timbre_sum) row.append(transpose_timbre[9][i]) seg_timbre_avg= get_avg(transpose_timbre[9]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[9]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[9]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[9]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[9]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[9]) row.append(seg_timbre_sum) row.append(transpose_timbre[10][i]) seg_timbre_avg= get_avg(transpose_timbre[10]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[10]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[10]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[10]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[10]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[10]) row.append(seg_timbre_sum) row.append(transpose_timbre[11][i]) seg_timbre_avg= get_avg(transpose_timbre[11]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[11]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[11]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[11]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[11]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[11]) row.append(seg_timbre_sum) for item in row_seg2_padding: row.append(item) writer.writerow(row) row=[] row=gral_info[:] # "--------tatums---------------" tatms_c = hdf5_getters.get_tatums_confidence(h5) group_index=5 row_front=padding(245) #blank spaces left in front of tatums row=[] row=gral_info[:] for i,item in enumerate(tatms_c): row.append(group_index) row.append(i) for item in row_front: #appending blank spaces at the front of the row row.append(item) row.append(tatms_c[i]) tatms_c_avg= get_avg(tatms_c) row.append(tatms_c_avg) tatms_c_max= get_max(tatms_c) row.append(tatms_c_max) tatms_c_min = get_min(tatms_c) row.append(tatms_c_min) tatms_c_stddev= get_stddev(tatms_c) row.append(tatms_c_stddev) tatms_c_count = get_count(tatms_c) row.append(tatms_c_count) tatms_c_sum = get_sum(tatms_c) row.append(tatms_c_sum) tatms_start = hdf5_getters.get_tatums_start(h5) row.append(tatms_start[i]) tatms_start_avg= get_avg(tatms_start) row.append(tatms_start_avg) tatms_start_max= get_max(tatms_start) row.append(tatms_start_max) tatms_start_min = get_min(tatms_start) row.append(tatms_start_min) tatms_start_stddev= get_stddev(tatms_start) row.append(tatms_start_stddev) tatms_start_count = get_count(tatms_start) row.append(tatms_start_count) tatms_start_sum = get_sum(tatms_start) row.append(tatms_start_sum) writer.writerow(row) row=[] row=gral_info[:] transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 h5.close() count=count+1; print count;
def data_to_flat_file(basedir,ext='.h5') : """This function extract the information from the tables and creates the flat file.""" count = 0; #song counter list_to_write= [] row_to_write = "" writer = csv.writer(open("metadata_wholeA.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') comma=title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') #eliminating commas in the album comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,"); if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg= get_avg(bars_c) bars_c_max= get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev= get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max= get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev= get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg= get_avg(beats_c) beats_c_max= get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev= get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max= get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev= get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg= get_avg(sec_c) sec_c_max= get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev= get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max= get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev= get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg= get_avg(seg_c) seg_c_max= get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev= get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg= get_avg(seg_loud_max) seg_loud_max_max= get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev= get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg= get_avg(seg_loud_max_time) seg_loud_max_time_max= get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg= get_avg(seg_loud_start) seg_loud_start_max= get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev= get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg= get_avg(seg_start) seg_start_max= get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev= get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg= get_avg(tatms_c) tatms_c_max= get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev= get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg= get_avg(tatms_start) tatms_start_max= get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev= get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 #genre was found in dictionary if genre_set == 1: col_num=[] for genre in final_genre: column=int(genre) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array else: genre_array=genre_columns(-1) #the genre was not found in the dictionary transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 #Writing to the flat file writer.writerow([title,album,artist_name,year,duration,seg_start_count, tempo]) h5.close() count=count+1; print count;
def main(argv): if len(argv) != 1: print "Specify data directory" return basedir = argv[0] outputFile1 = open('SongCSV.csv', 'w') outputFile2 = open('TagsCSV.csv', 'w') csvRowString = "" csvLabelString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input("\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"+ " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString); csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) #csvRowString = ("SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,"+ # "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,"+ # "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"+ # "Title,Year") csvRowString = ("ArtistFamiliarity,ArtistHotttnesss,"+ "BarsConfidence,BarsStart,BeatsConfidence,BeatsStart,Duration,"+ "EndOfFadeIn,Key,KeyConfidence,Loudness,Mode,ModeConfidence,"+ "SectionsConfidence,SectionsStart,SegmentsConfidence,SegmentsLoudnessMax,"+ "SegmentsLoudnessMaxTime,SegmentsLoudnessStart,SegmentsStart,"+ "SongHotttnesss,StartOfFadeOut,TatumsConfidence,TatumsStart,Tempo,TimeSignature,TimeSignatureConfidence,"+ "SegmentsPitches,SegmentsTimbre,Title,Year,Decade,ArtistMbtags") ################################################# header = str() csvAttributeList = re.split('\W+', csvRowString) arrayAttributes = ["BarsConfidence","BarsStart","BeatsConfidence","BeatsStart", "SectionsConfidence","SectionsStart","SegmentsConfidence","SegmentsLoudnessMax", "SegmentsLoudnessMaxTime","SegmentsLoudnessStart","SegmentsStart", "TatumsConfidence","TatumsStart"] for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() if(v=="SegmentsPitches"): for i in range(90): header = header + "SegmentsPitches" + str(i) + "," elif(v=="SegmentsTimbre"): for i in range(90): header = header + "SegmentsTimbre" + str(i) + "," elif(v in arrayAttributes): header = header + v + str(0) + "," header = header + v + str(1) + "," else: header = header + v + "," outputFile1.write("SongNumber,"); #outputFile1.write(csvRowString + "\n"); outputFile1.write(header + "\n"); csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate #basedir = "MillionSongSubset/data/A/A/" # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP all = sorted(os.walk(basedir)) for root, dirs, files in all: files = sorted(glob.glob(os.path.join(root,'*'+ext))) for f in files: print f songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) #testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.analysisSampleRate = str(hdf5_getters.get_analysis_sample_rate(songH5File)) song.artistFamiliarity = str(hdf5_getters.get_artist_familiarity(songH5File)) song.artistHotttnesss = str(hdf5_getters.get_artist_hotttnesss(songH5File)) song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File)) song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File)) song.artistMbid = str(hdf5_getters.get_artist_mbid(songH5File)) song.barsConfidence = np.array(hdf5_getters.get_bars_confidence(songH5File)) song.barsStart = np.array(hdf5_getters.get_bars_start(songH5File)) song.beatsConfidence = np.array(hdf5_getters.get_beats_confidence(songH5File)) song.beatsStart = np.array(hdf5_getters.get_beats_start(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) song.endOfFadeIn = str(hdf5_getters.get_end_of_fade_in(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.key = str(hdf5_getters.get_key(songH5File)) song.keyConfidence = str(hdf5_getters.get_key_confidence(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.modeConfidence = str(hdf5_getters.get_mode_confidence(songH5File)) song.sectionsConfidence = np.array(hdf5_getters.get_sections_confidence(songH5File)) song.sectionsStart = np.array(hdf5_getters.get_sections_start(songH5File)) song.segmentsConfidence = np.array(hdf5_getters.get_segments_confidence(songH5File)) song.segmentsLoudnessMax = np.array(hdf5_getters.get_segments_loudness_max(songH5File)) song.segmentsLoudnessMaxTime = np.array(hdf5_getters.get_segments_loudness_max_time(songH5File)) song.segmentsLoudnessStart = np.array(hdf5_getters.get_segments_loudness_start(songH5File)) song.segmentsPitches = np.array(hdf5_getters.get_segments_pitches(songH5File)) song.segmentsStart = np.array(hdf5_getters.get_segments_start(songH5File)) song.segmentsTimbre = np.array(hdf5_getters.get_segments_timbre(songH5File)) song.songHotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File)) song.startOfFadeOut = str(hdf5_getters.get_start_of_fade_out(songH5File)) song.tatumsConfidence = np.array(hdf5_getters.get_tatums_confidence(songH5File)) song.tatumsStart = np.array(hdf5_getters.get_tatums_start(songH5File)) song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str(hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str(hdf5_getters.get_time_signature_confidence(songH5File)) song.songid = str(hdf5_getters.get_song_id(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) song.artistMbtags = str(hdf5_getters.get_artist_mbtags(songH5File)) #print song count csvRowString += str(song.songCount) + "," csvLabelString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AnalysisSampleRate'.lower(): csvRowString += song.analysisSampleRate elif attribute == 'ArtistFamiliarity'.lower(): csvRowString += song.artistFamiliarity elif attribute == 'ArtistHotttnesss'.lower(): csvRowString += song.artistHotttnesss elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistMbid'.lower(): csvRowString += song.artistMbid elif attribute == 'BarsConfidence'.lower(): arr = song.barsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'BarsStart'.lower(): arr = song.barsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'BeatsConfidence'.lower(): arr = song.beatsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'BeatsStart'.lower(): arr = song.beatsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'EndOfFadeIn'.lower(): csvRowString += song.endOfFadeIn elif attribute == 'Energy'.lower(): csvRowString += song.energy elif attribute == 'Key'.lower(): csvRowString += song.key elif attribute == 'KeyConfidence'.lower(): csvRowString += song.keyConfidence elif attribute == 'Loudness'.lower(): csvRowString += song.loudness elif attribute == 'Mode'.lower(): csvRowString += song.mode elif attribute == 'ModeConfidence'.lower(): csvRowString += song.modeConfidence elif attribute == 'SectionsConfidence'.lower(): arr = song.sectionsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SectionsStart'.lower(): arr = song.sectionsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsConfidence'.lower(): arr = song.segmentsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsLoudnessMax'.lower(): arr = song.segmentsLoudnessMax if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsLoudnessMaxTime'.lower(): arr = song.segmentsLoudnessMaxTime if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsLoudnessStart'.lower(): arr = song.segmentsLoudnessStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsStart'.lower(): arr = song.segmentsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SongHotttnesss'.lower(): hotttnesss = song.songHotttnesss if hotttnesss == 'nan': hotttnesss = 'NaN' csvRowString += hotttnesss elif attribute == 'StartOfFadeOut'.lower(): csvRowString += song.startOfFadeOut elif attribute == 'TatumsConfidence'.lower(): arr = song.tatumsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'TatumsStart'.lower(): arr = song.tatumsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'SegmentsPitches'.lower(): colmean = np.mean(song.segmentsPitches,axis=0) for m in colmean: csvRowString += str(m) + "," cov = np.dot(song.segmentsPitches.T,song.segmentsPitches) utriind = np.triu_indices(cov.shape[0]) feats = cov[utriind] for feat in feats: csvRowString += str(feat) + "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] elif attribute == 'SegmentsTimbre'.lower(): colmean = np.mean(song.segmentsTimbre,axis=0) for m in colmean: csvRowString += str(m) + "," cov = np.dot(song.segmentsTimbre.T,song.segmentsTimbre) utriind = np.triu_indices(cov.shape[0]) feats = cov[utriind] for feat in feats: csvRowString += str(feat) + "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year elif attribute == 'Decade'.lower(): yr = song.year if yr > 0: decade = song.year[:-1] + '0' else: decade = '0' csvRowString += decade elif attribute == 'ArtistMbtags'.lower(): tags = song.artistMbtags[1:-1] tags = "\"" + tags + "\"" tags = tags.replace("\n",'') csvRowString += tags tagsarray = shlex.split(tags) for t in tagsarray: csvLabelString += t + "," else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," ''' if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace(',',"") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',','') csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): csvRowString += "\"" + song.artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," ''' #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" lastIndex = len(csvLabelString) csvLabelString = csvLabelString[0:lastIndex-1] csvLabelString += "\n" outputFile2.write(csvLabelString) csvLabelString = "" songH5File.close() outputFile1.close() outputFile2.close()
# title= hdf5_getters.get_title(h5,songidx=row)#.decode('UTF-8') # artist = "".join(c for c in unicodedata.normalize('NFD', str(artist.decode("utf8"))) if unicodedata.category(c) != "Mn") # title = "".join(c for c in unicodedata.normalize('NFD', str(title.decode("utf8"))) if unicodedata.category(c) != "Mn") #single number features danceability = hdf5_getters.get_danceability(h5, songidx=row) duration = hdf5_getters.get_duration(h5, songidx=row) energy = hdf5_getters.get_energy(h5, songidx=row) loudness = hdf5_getters.get_loudness(h5, songidx=row) musicalKey = hdf5_getters.get_key(h5, songidx=row) mode = hdf5_getters.get_mode(h5, songidx=row) tempo = hdf5_getters.get_tempo(h5, songidx=row) time_signature = hdf5_getters.get_time_signature(h5, songidx=row) year = hdf5_getters.get_year(h5, songidx=row) song_hottness = hdf5_getters.get_song_hotttnesss(h5, songidx=row) end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5, songidx=row) start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5, songidx=row) #timestamp features #take last element and divide by length to get beats/unit time, segments/unit_time bars_start = hdf5_getters.get_bars_start(h5, songidx=row) beats_start = hdf5_getters.get_beats_start(h5, songidx=row) sections_start = hdf5_getters.get_sections_start(h5, songidx=row) tatums_start = hdf5_getters.get_tatums_start(h5, songidx=row) segments_start = hdf5_getters.get_segments_start(h5, songidx=row) if len(bars_start) == 0: bars_start = 0. else: bars_start = bars_start[-1] / len(bars_start) if len(beats_start) == 0: beats_start = 0. else: beats_start = beats_start[-1] / len(beats_start) if len(sections_start) == 0: sections_start = 0. else: sections_start = sections_start[-1] / len(sections_start)
def main(): outputFile = open('songs.csv', 'w') writer = csv.writer(outputFile) csvRowString = "song_number,artist_familiarity,artist_hotttnesss,artist_id,artist_mbid,artist_playmeid,artist_7digitalid,artist_latitude,artist_longitude,artist_location,artist_name,release,release_7digitalid,song_id,song_hotttnesss,title,track_7digitalid,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id,year" outputFile.write(csvRowString + "\n") csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "." # "." As the default means the current directory ext = ".H5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP songCount = 0 for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print(f) songH5File = hdf5_getters.open_h5_file_read(f) values = [ songCount, hdf5_getters.get_artist_familiarity(songH5File), hdf5_getters.get_artist_hotttnesss(songH5File), hdf5_getters.get_artist_id(songH5File), hdf5_getters.get_artist_mbid(songH5File), hdf5_getters.get_artist_playmeid(songH5File), hdf5_getters.get_artist_7digitalid(songH5File), hdf5_getters.get_artist_latitude(songH5File), hdf5_getters.get_artist_longitude(songH5File), hdf5_getters.get_artist_location(songH5File), hdf5_getters.get_artist_name(songH5File), hdf5_getters.get_release(songH5File), hdf5_getters.get_release_7digitalid(songH5File), hdf5_getters.get_song_id(songH5File), hdf5_getters.get_song_hotttnesss(songH5File), hdf5_getters.get_title(songH5File), hdf5_getters.get_track_7digitalid(songH5File), hdf5_getters.get_analysis_sample_rate(songH5File), hdf5_getters.get_audio_md5(songH5File), hdf5_getters.get_danceability(songH5File), hdf5_getters.get_duration(songH5File), hdf5_getters.get_end_of_fade_in(songH5File), hdf5_getters.get_energy(songH5File), hdf5_getters.get_key(songH5File), hdf5_getters.get_key_confidence(songH5File), hdf5_getters.get_loudness(songH5File), hdf5_getters.get_mode(songH5File), hdf5_getters.get_mode_confidence(songH5File), hdf5_getters.get_start_of_fade_out(songH5File), hdf5_getters.get_tempo(songH5File), hdf5_getters.get_time_signature(songH5File), hdf5_getters.get_time_signature_confidence(songH5File), hdf5_getters.get_track_id(songH5File), hdf5_getters.get_year(songH5File) ] songH5File.close() songCount = songCount + 1 writer.writerow(values) outputFile.close()
artist_mbid = GETTERS.get_artist_mbid(h5, i) artist_mbtags = ','.join(str(e) for e in GETTERS.get_artist_mbtags(h5, i)) # array artist_mbtags_count = ','.join(str(e) for e in GETTERS.get_artist_mbtags_count(h5, i)) # array artist_name = GETTERS.get_artist_name(h5, i) artist_playmeid = GETTERS.get_artist_playmeid(h5, i) artist_terms = ','.join(str(e) for e in GETTERS.get_artist_terms(h5, i)) # array #artist_terms_freq = ','.join(str(e) for e in GETTERS.get_artist_terms_freq(h5, i)) # array #artist_terms_weight = ','.join(str(e) for e in GETTERS.get_artist_terms_weight(h5, i)) # array #audio_md5 = GETTERS.get_audio_md5(h5, i) #bars_confidence = ','.join(str(e) for e in GETTERS.get_bars_confidence(h5, i)) # array #bars_start = ','.join(str(e) for e in GETTERS.get_bars_start(h5, i)) # array #beats_confidence = ','.join(str(e) for e in GETTERS.get_beats_confidence(h5, i)) # array #beats_start = ','.join(str(e) for e in GETTERS.get_beats_start(h5, i)) # array danceability = GETTERS.get_danceability(h5, i) duration = GETTERS.get_duration(h5, i) end_of_fade_in = GETTERS.get_end_of_fade_in(h5, i) energy = GETTERS.get_energy(h5, i) key = GETTERS.get_key(h5, i) key_confidence = GETTERS.get_key_confidence(h5, i) loudness = GETTERS.get_loudness(h5, i) mode = GETTERS.get_mode(h5, i) mode_confidence = GETTERS.get_mode_confidence(h5, i) release = GETTERS.get_release(h5, i) release_7digitalid = GETTERS.get_release_7digitalid(h5, i) #sections_confidence = ','.join(str(e) for e in GETTERS.get_sections_confidence(h5, i)) # array #sections_start = ','.join(str(e) for e in GETTERS.get_sections_start(h5, i)) # array #segments_confidence = ','.join(str(e) for e in GETTERS.get_segments_confidence(h5, i)) # array #segments_loudness_max = ','.join(str(e) for e in GETTERS.get_segments_loudness_max(h5, i)) # array #segments_loudness_max_time = ','.join(str(e) for e in GETTERS.get_segments_loudness_max_time(h5, i)) # array #segments_loudness_start = ','.join(str(e) for e in GETTERS.get_segments_loudness_start(h5, i)) # array #segments_pitches = ','.join(str(e) for e in GETTERS.get_segments_pitches(h5, i)) # array
time_signature=[] end_of_fade_in=[] mode=[] start_of_fade_out=[] song_hotttnesss=[] for i in range(0,len(file)): h5 = yay.open_h5_file_read('F:\sem4\ml\project\MillionSongSubset\data\A\{}'.format(file[i])) duration.append(yay.get_duration(h5)) artist_familiarity.append(yay.get_artist_familiarity(h5)) artist_hotttnesss.append(yay.get_artist_hotttnesss(h5)) tempo.append(yay.get_tempo(h5)) loudness.append(yay.get_loudness(h5)) key.append(yay.get_key(h5)) time_signature.append(yay.get_time_signature(h5)) end_of_fade_in.append(yay.get_end_of_fade_in(h5)) mode.append(yay.get_mode(h5)) start_of_fade_out.append(yay.get_start_of_fade_out(h5)) song_hotttnesss.append(yay.get_song_hotttnesss(h5)) rows = zip(duration,artist_familiarity,artist_hotttnesss,tempo,loudness,key,time_signature, end_of_fade_in,mode,start_of_fade_out,song_hotttnesss) import csv with open('training_data.csv', "w", encoding="ISO-8859-1", newline='') as f: fieldnames = ['duration','artist_familiarity','artist_hotttnesss','tempo','loudness','key','time_signature', 'end_of_fade_in','mode','start_of_fade_out','song_hotttnesss'] writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer = csv.writer(f)
def hd5_single_random_file_parser(): # Open an h5 file in read mode h5 = hdf5_getters.open_h5_file_read( '/home/skalogerakis/Documents/MillionSong/MillionSongSubset/A/M/G/TRAMGDX12903CEF79F.h5' ) function_tracker = filter( lambda x: x.startswith('get'), hdf5_getters.__dict__.keys()) # Detects all the getter functions for f in function_tracker: # Print everything in function tracker print(f) # First effort to check what each field contains. print() # 55 available fields (exluding number of songs fields) print("Num of songs -- ", hdf5_getters.get_num_songs(h5)) # One song per file print("Title -- ", hdf5_getters.get_title(h5)) # Print the title of a specific h5 file print("Artist familiarity -- ", hdf5_getters.get_artist_familiarity(h5)) print("Artist hotness -- ", hdf5_getters.get_artist_hotttnesss(h5)) print("Artist ID -- ", hdf5_getters.get_artist_id(h5)) print("Artist mbID -- ", hdf5_getters.get_artist_mbid(h5)) print("Artist playmeid -- ", hdf5_getters.get_artist_playmeid(h5)) print("Artist 7DigitalID -- ", hdf5_getters.get_artist_7digitalid(h5)) print("Artist latitude -- ", hdf5_getters.get_artist_latitude(h5)) print("Artist longitude -- ", hdf5_getters.get_artist_longitude(h5)) print("Artist location -- ", hdf5_getters.get_artist_location(h5)) print("Artist Name -- ", hdf5_getters.get_artist_name(h5)) print("Release -- ", hdf5_getters.get_release(h5)) print("Release 7DigitalID -- ", hdf5_getters.get_release_7digitalid(h5)) print("Song ID -- ", hdf5_getters.get_song_id(h5)) print("Song Hotness -- ", hdf5_getters.get_song_hotttnesss(h5)) print("Track 7Digital -- ", hdf5_getters.get_track_7digitalid(h5)) print("Similar artists -- ", hdf5_getters.get_similar_artists(h5)) print("Artist terms -- ", hdf5_getters.get_artist_terms(h5)) print("Artist terms freq -- ", hdf5_getters.get_artist_terms_freq(h5)) print("Artist terms weight -- ", hdf5_getters.get_artist_terms_weight(h5)) print("Analysis sample rate -- ", hdf5_getters.get_analysis_sample_rate(h5)) print("Audio md5 -- ", hdf5_getters.get_audio_md5(h5)) print("Danceability -- ", hdf5_getters.get_danceability(h5)) print("Duration -- ", hdf5_getters.get_duration(h5)) print("End of Fade -- ", hdf5_getters.get_end_of_fade_in(h5)) print("Energy -- ", hdf5_getters.get_energy(h5)) print("Key -- ", hdf5_getters.get_key(h5)) print("Key Confidence -- ", hdf5_getters.get_key_confidence(h5)) print("Loudness -- ", hdf5_getters.get_loudness(h5)) print("Mode -- ", hdf5_getters.get_mode(h5)) print("Mode Confidence -- ", hdf5_getters.get_mode_confidence(h5)) print("Start of fade out -- ", hdf5_getters.get_start_of_fade_out(h5)) print("Tempo -- ", hdf5_getters.get_tempo(h5)) print("Time signature -- ", hdf5_getters.get_time_signature(h5)) print("Time signature confidence -- ", hdf5_getters.get_time_signature_confidence(h5)) print("Track ID -- ", hdf5_getters.get_track_id(h5)) print("Segments Start -- ", hdf5_getters.get_segments_start(h5)) print("Segments Confidence -- ", hdf5_getters.get_segments_confidence(h5)) print("Segments Pitches -- ", hdf5_getters.get_segments_pitches(h5)) print("Segments Timbre -- ", hdf5_getters.get_segments_timbre(h5)) print("Segments Loudness max -- ", hdf5_getters.get_segments_loudness_max(h5)) print("Segments Loudness max time-- ", hdf5_getters.get_segments_loudness_max_time(h5)) print("Segments Loudness start -- ", hdf5_getters.get_segments_loudness_start(h5)) print("Sections start -- ", hdf5_getters.get_sections_start(h5)) print("Sections Confidence -- ", hdf5_getters.get_sections_confidence(h5)) print("Beats start -- ", hdf5_getters.get_beats_start(h5)) print("Beats confidence -- ", hdf5_getters.get_beats_confidence(h5)) print("Bars start -- ", hdf5_getters.get_bars_start(h5)) print("Bars confidence -- ", hdf5_getters.get_bars_confidence(h5)) print("Tatums start -- ", hdf5_getters.get_tatums_start(h5)) print("Tatums confidence -- ", hdf5_getters.get_tatums_confidence(h5)) print("Artist mbtags -- ", hdf5_getters.get_artist_mbtags(h5)) print("Artist mbtags count -- ", hdf5_getters.get_artist_mbtags_count(h5)) print("Year -- ", hdf5_getters.get_year(h5)) fields = ['Title', 'Artist ID'] with open('Tester2.csv', 'w', newline='') as csvfile: csv_writer = csv.writer(csvfile, delimiter=';') # writing the fields csv_writer.writerow(fields) # writing the data rows csv_writer.writerow( [hdf5_getters.get_title(h5), hdf5_getters.get_artist_id(h5)]) h5.close() # close h5 when completed in the end
def main(): outputFile1 = open('SongCSVFinal.csv', 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input( "\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude," + " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = ( "SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation," + "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature," + "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence," + "Title,Year,mbID,Energy,ArtistFamiliarity,Hotness,end_of_fade_in,key,keyConfidence,Loudness," + "mode,mode_confidence,start_of_fade_out") ################################################# csvAttributeList = re.split('\W+', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() outputFile1.write("SongNumber,") outputFile1.write(csvRowString + "\n") csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "." # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #files = glob.glob(os.path.join(root,'*'+ext)) #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) print(files) for f in files: songH5File = hdf5_getters.open_h5_file_read(f) print('hello 1') print(songH5File) song = Song(str(hdf5_getters.get_song_id(songH5File))) testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumName = str(hdf5_getters.get_release(songH5File)) song.artistLatitude = str( hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str( hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str( hdf5_getters.get_artist_longitude(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) # song.lyrics = None # song.popularity = None song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str( hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) song.mbID = str(hdf5_getters.get_artist_mbid(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) #song.beatConfidence = str(hdf5_getters.get_beats_confidence(songH5File)) song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.key = str(hdf5_getters.get_key(songH5File)) song.artistFamiliarity = str( hdf5_getters.get_artist_familiarity(songH5File)) song.keyConfidence = str( hdf5_getters.get_key_confidence(songH5File)) song.hotness = str(hdf5_getters.get_artist_hotttnesss(songH5File)) #song.sampleRate= str(hdf5_getters.get_analysis_sample_rate(songH5File)) song.end_of_fade_in = str( hdf5_getters.get_end_of_fade_in(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.mode_confidence = str( hdf5_getters.get_mode_confidence(songH5File)) song.start_of_fade_out = str( hdf5_getters.get_start_of_fade_out(songH5File)) #print song count csvRowString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace(',', "") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',', '') csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): csvRowString += "\"" + song.artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year elif attribute == 'mbID'.lower(): csvRowString += song.mbID elif attribute == 'Energy'.lower(): csvRowString += song.energy elif attribute == 'ArtistFamiliarity'.lower(): csvRowString += song.artistFamiliarity elif attribute == 'Hotness'.lower(): csvRowString += song.hotness #elif attribute == 'SampleRate'.lower(): #csvRowString += song.sampleRate elif attribute == 'end_of_fade_in'.lower(): csvRowString += song.end_of_fade_in elif attribute == 'key'.lower(): csvRowString += song.key elif attribute == 'keyConfidence'.lower(): csvRowString += song.keyConfidence elif attribute == 'Loudness'.lower(): csvRowString += song.loudness elif attribute == 'mode'.lower(): csvRowString += song.mode elif attribute == 'mode_confidence'.lower(): csvRowString += song.mode_confidence elif attribute == 'start_of_fade_out'.lower(): csvRowString += song.start_of_fade_out else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" songH5File.close() outputFile1.close()
# Get loudness loudness = hdf5_getters.get_loudness(h5) # Get year year = hdf5_getters.get_year(h5) # Get tempo tempo = hdf5_getters.get_tempo(h5) ######################################################### # Get analysis sample rate analysis_rate = hdf5_getters.get_analysis_sample_rate(h5) # Get end of fade in end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5) # Get key key = hdf5_getters.get_key(h5) # Get key confidence key_confidence = hdf5_getters.get_key_confidence(h5) # Get mode mode = hdf5_getters.get_mode(h5) # Get mode confidence mode_confidence = hdf5_getters.get_mode_confidence(h5) # Get start of fade-out start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5)
def convert_to_csv(): i = 0 data = [] target = [] count = 0 with open('data_timbre.csv', 'w+') as f: with open('target_timbre.csv', 'w+') as f2: writer = csv.writer(f) target_writer = csv.writer(f2) for root, dirs, files in os.walk(msd_subset_data_path): files = glob.glob(os.path.join(root,'*.h5')) for f in sorted(files): try: # Opening is very prone to causing exceptions, we'll just skip file if exception is thrown h5 = getter.open_h5_file_read(f) year = getter.get_year(h5) if year: count +=1 analysis_file = open('current_analysis_status.txt','a') update = "Currently at file name: " + str(f) + " and at number " + str(count) + "\n" analysis_file.write(update) print update analysis_file.close() target.append([year]) row = [] timbre = getter.get_segments_timbre(h5) segstarts = getter.get_segments_start(h5) btstarts = getter.get_beats_start(h5) duration = getter.get_duration(h5) end_of_fade_in = getter.get_end_of_fade_in(h5) key = getter.get_key(h5) key_confidence = getter.get_key_confidence(h5) loudness = getter.get_loudness(h5) start_of_fade_out = getter.get_start_of_fade_out(h5) tempo = getter.get_tempo(h5) time_signature = getter.get_time_signature(h5) time_signature_confidence = getter.get_time_signature_confidence(h5) h5.close() # VERY IMPORTANT segstarts = np.array(segstarts).flatten() btstarts = np.array(btstarts).flatten() bttimbre = align_feats(timbre.T, segstarts, btstarts, duration, end_of_fade_in, key, key_confidence, loudness, start_of_fade_out, tempo, time_signature, time_signature_confidence) if bttimbre is None: continue # Skip this track, some features broken npicks, winsize, finaldim = 12, 12, 144 # Calculated by 12 * 12. 12 is fixed as number of dimensions. processed_feats = extract_and_compress(bttimbre, npicks, winsize, finaldim) n_p_feats = processed_feats.shape[0] if processed_feats is None: continue # Skip this track, some features broken row = processed_feats.flatten() if len(row) != 12*144: # 12 dimensions * 144 features per dimension continue # Not enough features year_row = np.array([year]) if row.any() and year_row.any(): writer.writerow(row) target_writer.writerow(year_row) i+=1 else: h5.close() except Exception: pass print 'Finished!' analysis_file = open('current_analysis_status.txt','a') analysis_file.write('Done!') analysis_file.close() return
def complete_hd5_to_csv(basedir): ext = '.h5' # Get all files with extension .h5 # Header title. Essentially it is a schema for all the following songs header = [ 'Title', 'Artist familiarity', 'Artist hotness', 'Artist ID', 'Artist mbID', 'Artist playmeid', 'Artist 7DigitalID', 'Artist latitude', 'Artist longitude', 'Artist location', 'Artist Name', 'Release', 'Release 7DigitalID', 'Song ID', 'Song Hotness', 'Track 7Digital', 'Analysis sample rate', 'Audio md5', 'Danceability', 'Duration', 'End of Fade', 'Energy', 'Key', 'Key Confidence', 'Loudness', 'Mode', 'Mode Confidence', 'Start of fade out', 'Tempo', 'Time signature', 'Time signature confidence', 'Track ID', 'Year' ] with open('Tester2.csv', 'w', newline='') as csvfile: csv_writer = csv.writer(csvfile, delimiter=';') # writing the header line. This line contains the schema of the data csv_writer.writerow(header) # Read all files from the given directories for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) print(files) for f in files: h5 = hdf5_getters.open_h5_file_read(f) # Write as row all elements. NOTE: Only the serialized elements are parsed and not arrays csv_writer.writerow([ hdf5_getters.get_title(h5), hdf5_getters.get_artist_familiarity(h5), hdf5_getters.get_artist_hotttnesss(h5), hdf5_getters.get_artist_id(h5), hdf5_getters.get_artist_mbid(h5), hdf5_getters.get_artist_playmeid(h5), hdf5_getters.get_artist_7digitalid(h5), hdf5_getters.get_artist_latitude(h5), hdf5_getters.get_artist_longitude(h5), hdf5_getters.get_artist_location(h5), hdf5_getters.get_artist_name(h5), hdf5_getters.get_release(h5), hdf5_getters.get_release_7digitalid(h5), hdf5_getters.get_song_id(h5), hdf5_getters.get_song_hotttnesss(h5), hdf5_getters.get_track_7digitalid(h5), hdf5_getters.get_analysis_sample_rate(h5), hdf5_getters.get_audio_md5(h5), hdf5_getters.get_danceability(h5), hdf5_getters.get_duration(h5), hdf5_getters.get_end_of_fade_in(h5), hdf5_getters.get_energy(h5), hdf5_getters.get_key(h5), hdf5_getters.get_key_confidence(h5), hdf5_getters.get_loudness(h5), hdf5_getters.get_mode(h5), hdf5_getters.get_mode_confidence(h5), hdf5_getters.get_start_of_fade_out(h5), hdf5_getters.get_tempo(h5), hdf5_getters.get_time_signature(h5), hdf5_getters.get_time_signature_confidence(h5), hdf5_getters.get_track_id(h5), hdf5_getters.get_year(h5) ]) # For debugging purposes. Everything as expected # print() # print("Num of songs -- ", hdf5_getters.get_num_songs(h5)) # One song per file # print("Title -- ", hdf5_getters.get_title(h5)) # Print the title of a specific h5 file # print("Artist familiarity -- ", hdf5_getters.get_artist_familiarity(h5)) # print("Artist hotness -- ", hdf5_getters.get_artist_hotttnesss(h5)) # print("Artist ID -- ", hdf5_getters.get_artist_id(h5)) # print("Artist mbID -- ", hdf5_getters.get_artist_mbid(h5)) # print("Artist playmeid -- ", hdf5_getters.get_artist_playmeid(h5)) # print("Artist 7DigitalID -- ", hdf5_getters.get_artist_7digitalid(h5)) # print("Artist latitude -- ", hdf5_getters.get_artist_latitude(h5)) # print("Artist longitude -- ", hdf5_getters.get_artist_longitude(h5)) # print("Artist location -- ", hdf5_getters.get_artist_location(h5)) # print("Artist Name -- ", hdf5_getters.get_artist_name(h5)) # print("Release -- ", hdf5_getters.get_release(h5)) # print("Release 7DigitalID -- ", hdf5_getters.get_release_7digitalid(h5)) # print("Song ID -- ", hdf5_getters.get_song_id(h5)) # print("Song Hotness -- ", hdf5_getters.get_song_hotttnesss(h5)) # print("Track 7Digital -- ", hdf5_getters.get_track_7digitalid(h5)) # print("Analysis sample rate -- ", hdf5_getters.get_analysis_sample_rate(h5)) # print("Audio md5 -- ", hdf5_getters.get_audio_md5(h5)) # print("Danceability -- ", hdf5_getters.get_danceability(h5)) # print("Duration -- ", hdf5_getters.get_duration(h5)) # print("End of Fade -- ", hdf5_getters.get_end_of_fade_in(h5)) # print("Energy -- ", hdf5_getters.get_energy(h5)) # print("Key -- ", hdf5_getters.get_key(h5)) # print("Key Confidence -- ", hdf5_getters.get_key_confidence(h5)) # print("Loudness -- ", hdf5_getters.get_loudness(h5)) # print("Mode -- ", hdf5_getters.get_mode(h5)) # print("Mode Confidence -- ", hdf5_getters.get_mode_confidence(h5)) # print("Start of fade out -- ", hdf5_getters.get_start_of_fade_out(h5)) # print("Tempo -- ", hdf5_getters.get_tempo(h5)) # print("Time signature -- ", hdf5_getters.get_time_signature(h5)) # print("Time signature confidence -- ", hdf5_getters.get_time_signature_confidence(h5)) # print("Track ID -- ", hdf5_getters.get_track_id(h5)) # # print("Artist mbtags -- ", hdf5_getters.get_artist_mbtags(h5)) # # print("Artist mbtags count -- ", hdf5_getters.get_artist_mbtags_count(h5)) # print("Year -- ", hdf5_getters.get_year(h5)) h5.close()
hdf5path = "MillionSongSubset/data/"+f1+"/"+f2+"/"+f3+"/"+track_id+".h5" total_tracks += 1 # Check existence of the HDF5 file if not os.path.isfile(hdf5path): continue #print('ERROR: file',hdf5path,'does not exist.') #sys.exit(0) h5 = hdf5_getters.open_h5_file_read(hdf5path) # Retrieve features from HDF5 danceability = hdf5_getters.get_danceability(h5) duration = hdf5_getters.get_duration(h5) time_of_fade_in = hdf5_getters.get_end_of_fade_in(h5) energy = hdf5_getters.get_energy(h5) key = hdf5_getters.get_key(h5) key_confidence = hdf5_getters.get_key_confidence(h5) loudness = hdf5_getters.get_loudness(h5) mode = hdf5_getters.get_mode(h5) mode_confidence = hdf5_getters.get_mode_confidence(h5) sections_start = hdf5_getters.get_sections_start(h5) num_sections = len(sections_start) if num_sections == 0: h5.close() continue segments_loudness_max = hdf5_getters.get_segments_loudness_max(h5) segments_loudness_start = hdf5_getters.get_segments_loudness_start(h5) num_segments = len(hdf5_getters.get_segments_start(h5)) num_tatums = len(hdf5_getters.get_tatums_start(h5))
def classify(h5): output_array = {} # duration duration = hdf5_getters.get_duration(h5) output_array["duration"] = duration ### ADDED VALUE TO ARRAY # number of bars bars = hdf5_getters.get_bars_start(h5) num_bars = len(bars) output_array["num_bars"] = num_bars ### ADDED VALUE TO ARRAY # mean and variance in bar length bar_length = numpy.ediff1d(bars) variance_bar_length = numpy.var(bar_length) output_array[ "variance_bar_length"] = variance_bar_length ### ADDED VALUE TO ARRAY # number of beats beats = hdf5_getters.get_beats_start(h5) num_beats = len(beats) output_array["num_beats"] = num_beats ### ADDED VALUE TO ARRAY # mean and variance in beats length beats_length = numpy.ediff1d(beats) variance_beats_length = numpy.var(bar_length) output_array[ "variance_beats_length"] = variance_beats_length ### ADDED VALUE TO ARRAY # danceability danceability = hdf5_getters.get_danceability(h5) output_array["danceability"] = danceability ### ADDED VALUE TO ARRAY # end of fade in end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5) output_array["end_of_fade_in"] = end_of_fade_in ### ADDED VALUE TO ARRAY # energy energy = hdf5_getters.get_energy(h5) output_array["energy"] = energy ### ADDED VALUE TO ARRAY # key key = hdf5_getters.get_key(h5) output_array["key"] = int(key) ### ADDED VALUE TO ARRAY # loudness loudness = hdf5_getters.get_loudness(h5) output_array["loudness"] = loudness ### ADDED VALUE TO ARRAY # mode mode = hdf5_getters.get_mode(h5) output_array["mode"] = int(mode) ### ADDED VALUE TO ARRAY # number sections sections = hdf5_getters.get_sections_start(h5) num_sections = len(sections) output_array["num_sections"] = num_sections ### ADDED VALUE TO ARRAY # mean and variance in sections length sections_length = numpy.ediff1d(sections) variance_sections_length = numpy.var(sections) output_array[ "variance_sections_length"] = variance_sections_length ### ADDED VALUE TO ARRAY # number segments segments = hdf5_getters.get_segments_start(h5) num_segments = len(segments) output_array["num_segments"] = num_segments ### ADDED VALUE TO ARRAY # mean and variance in segments length segments_length = numpy.ediff1d(segments) variance_segments_length = numpy.var(segments) output_array[ "variance_segments_length"] = variance_segments_length ### ADDED VALUE TO ARRAY # segment loudness max segment_loudness_max_array = hdf5_getters.get_segments_loudness_max(h5) segment_loudness_max_time_array = hdf5_getters.get_segments_loudness_max_time( h5) segment_loudness_max_index = 0 for i in range(len(segment_loudness_max_array)): if segment_loudness_max_array[i] > segment_loudness_max_array[ segment_loudness_max_index]: segment_loudness_max_index = i segment_loudness_max = segment_loudness_max_array[ segment_loudness_max_index] segment_loudness_max_time = segment_loudness_max_time_array[ segment_loudness_max_index] output_array[ "segment_loudness_max"] = segment_loudness_max ### ADDED VALUE TO ARRAY output_array[ "segment_loudness_time"] = segment_loudness_max_time ### ADDED VALUE TO ARRAY # POSSIBLE TODO: use average function instead and weight by segment length # segment loudness mean (start) segment_loudness_array = hdf5_getters.get_segments_loudness_start(h5) segment_loudness_mean = numpy.mean(segment_loudness_array) output_array[ "segment_loudness_mean"] = segment_loudness_mean ### ADDED VALUE TO ARRAY # segment loudness variance (start) segment_loudness_variance = numpy.var(segment_loudness_array) output_array[ "segment_loudness_variance"] = segment_loudness_variance ### ADDED VALUE TO ARRAY # segment pitches segment_pitches_array = hdf5_getters.get_segments_pitches(h5) segment_pitches_mean = numpy.mean(segment_pitches_array, axis=0).tolist() output_array["segment_pitches_mean"] = segment_pitches_mean # segment pitches variance (start) segment_pitches_variance = numpy.var(segment_pitches_array, axis=0).tolist() output_array["segment_pitches_variance"] = segment_pitches_variance # segment timbres segment_timbres_array = hdf5_getters.get_segments_timbre(h5) segment_timbres_mean = numpy.mean(segment_timbres_array, axis=0).tolist() output_array["segment_timbres_mean"] = segment_timbres_mean # segment timbres variance (start) segment_timbres_variance = numpy.var(segment_timbres_array, axis=0).tolist() output_array["segment_timbres_variance"] = segment_timbres_variance # hotttnesss hottness = hdf5_getters.get_song_hotttnesss(h5, 0) output_array["hottness"] = hottness ### ADDED VALUE TO ARRAY # duration-start of fade out start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5) fade_out = duration - start_of_fade_out output_array["fade_out"] = fade_out ### ADDED VALUE TO ARRAY # tatums tatums = hdf5_getters.get_tatums_start(h5) num_tatums = len(tatums) output_array["num_tatums"] = num_tatums ### ADDED VALUE TO ARRAY # mean and variance in tatums length tatums_length = numpy.ediff1d(tatums) variance_tatums_length = numpy.var(tatums_length) output_array[ "variance_tatums_length"] = variance_tatums_length ### ADDED VALUE TO ARRAY # tempo tempo = hdf5_getters.get_tempo(h5) output_array["tempo"] = tempo ### ADDED VALUE TO ARRAY # time signature time_signature = hdf5_getters.get_time_signature(h5) output_array["time_signature"] = int( time_signature) ### ADDED VALUE TO ARRAY # year year = hdf5_getters.get_year(h5) output_array["year"] = int(year) ### ADDED VALUE TO ARRAY # artist terms artist_terms = hdf5_getters.get_artist_terms(h5, 0) output_array["artist_terms"] = artist_terms.tolist() artist_terms_freq = hdf5_getters.get_artist_terms_freq(h5, 0) output_array["artist_terms_freq"] = artist_terms_freq.tolist() artist_name = hdf5_getters.get_artist_name(h5, 0) output_array["artist_name"] = artist_name artist_id = hdf5_getters.get_artist_id(h5, 0) output_array["artist_id"] = artist_id # title title = hdf5_getters.get_title(h5, 0) output_array["title"] = title return output_array
def data_to_flat_file(basedir, ext='.h5'): """This function extract the information from the tables and creates the flat file.""" count = 0 #song counter list_to_write = [] row_to_write = "" writer = csv.writer(open("metadata_wholeA.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title = title.replace('"', '') comma = title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album = album.replace('"', '') #eliminating commas in the album comma = album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma = artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name = artist_name.replace('"', '') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam = -1 artist_hotness = hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness = -1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat = -1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,") if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon = -1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability = -1 end_fade_in = hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in = -1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy = -1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c = -1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness = -1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf = -1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot = -1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo = -1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c = -1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg = get_avg(bars_c) bars_c_max = get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev = get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max = get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev = get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg = get_avg(beats_c) beats_c_max = get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev = get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max = get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev = get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg = get_avg(sec_c) sec_c_max = get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev = get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max = get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev = get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg = get_avg(seg_c) seg_c_max = get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev = get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg = get_avg(seg_loud_max) seg_loud_max_max = get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev = get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg = get_avg(seg_loud_max_time) seg_loud_max_time_max = get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev = get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg = get_avg(seg_loud_start) seg_loud_start_max = get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev = get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg = get_avg(seg_start) seg_start_max = get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev = get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg = get_avg(tatms_c) tatms_c_max = get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev = get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg = get_avg(tatms_start) tatms_start_max = get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev = get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes = get_genre_indexes( trm_freq) #index of the highest freq final_genre = [] genres_so_far = [] for i in range(len(genre_indexes)): genre_tmp = get_genre( art_trm, genre_indexes[i] ) #genre that corresponds to the highest freq genres_so_far = genre_dict.get_genre_in_dict( genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set = 1 #genre was found in dictionary if genre_set == 1: col_num = [] for genre in final_genre: column = int( genre) #getting the column number of the genre col_num.append(column) genre_array = genre_columns(col_num) #genre array else: genre_array = genre_columns( -1) #the genre was not found in the dictionary transpose_pitch = seg_pitch.transpose( ) #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg = [] seg_pitch_max = [] seg_pitch_min = [] seg_pitch_stddev = [] seg_pitch_count = [] seg_pitch_sum = [] i = 0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i = i + 1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose( ) #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg = [] seg_timbre_max = [] seg_timbre_min = [] seg_timbre_stddev = [] seg_timbre_count = [] seg_timbre_sum = [] i = 0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i = i + 1 #Writing to the flat file writer.writerow([ title, album, artist_name, year, duration, seg_start_count, tempo ]) h5.close() count = count + 1 print count
def fill_attributes(song, songH5File): #----------------------------non array attributes------------------------------- song.analysisSampleRate = str( hdf5_getters.get_analysis_sample_rate(songH5File)) song.artistDigitalID = str(hdf5_getters.get_artist_7digitalid(songH5File)) song.artistFamiliarity = str( hdf5_getters.get_artist_familiarity(songH5File)) song.artistHotness = str(hdf5_getters.get_artist_hottness(songH5File)) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str(hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File)) song.artistmbID = str(hdf5_getters.get_artist_mbid(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.artistPlayMeID = str(hdf5_getters.get_artist_playmeid(songH5File)) song.audioMD5 = str(hdf5_getters.get_audio_md5(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) song.endOfFadeIn = str(hdf5_getters.get_end_of_fade_in(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.key = str(hdf5_getters.get_key(songH5File)) song.keyConfidence = str(hdf5_getters.get_key_confidence(songH5File)) song.segementsConfidence = str( hdf5_getters.get_segments_confidence(songH5File)) song.segementsConfidence = str( hdf5_getters.get_sections_confidence(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.modeConfidence = str(hdf5_getters.get_mode_confidence(songH5File)) song.release = str(hdf5_getters.get_release(songH5File)) song.releaseDigitalID = str( hdf5_getters.get_release_7digitalid(songH5File)) song.songHotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File)) song.startOfFadeOut = str(hdf5_getters.get_start_of_fade_out(songH5File)) song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str(hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.trackID = str(hdf5_getters.get_track_id(songH5File)) song.trackDigitalID = str(hdf5_getters.get_track_7digitalid(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) #-------------------------------array attributes-------------------------------------- #array float song.beatsStart_mean, song.beatsStart_var = convert_array_to_meanvar( hdf5_getters.get_beats_start(songH5File)) #array float song.artistTermsFreq_mean, song.artistTermsFreq_var = convert_array_to_meanvar( hdf5_getters.get_artist_terms_freq(songH5File)) #array float song.artistTermsWeight_mean, song.artistTermsWeight_var = convert_array_to_meanvar( hdf5_getters.get_artist_terms_weight(songH5File)) #array int song.artistmbTagsCount_mean, song.artistmbTagsCount_var = convert_array_to_meanvar( hdf5_getters.get_artist_mbtags_count(songH5File)) #array float song.barsConfidence_mean, song.barsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_bars_confidence(songH5File)) #array float song.barsStart_mean, song.barsStart_var = convert_array_to_meanvar( hdf5_getters.get_bars_start(songH5File)) #array float song.beatsConfidence_mean, song.beatsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_beats_confidence(songH5File)) #array float song.sectionsConfidence_mean, song.sectionsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_sections_confidence(songH5File)) #array float song.sectionsStart_mean, song.sectionsStart_var = convert_array_to_meanvar( hdf5_getters.get_sections_start(songH5File)) #array float song.segmentsConfidence_mean, song.segmentsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_segments_confidence(songH5File)) #array float song.segmentsLoudness_mean, song.segmentsLoudness_var = convert_array_to_meanvar( hdf5_getters.get_segments_loudness_max(songH5File)) #array float song.segmentsLoudnessMaxTime_mean, song.segmentsLoudnessMaxTime_var = convert_array_to_meanvar( hdf5_getters.get_segments_loudness_max_time(songH5File)) #array float song.segmentsLoudnessMaxStart_mean, song.segmentsLoudnessMaxStart_var = convert_array_to_meanvar( hdf5_getters.get_segments_loudness_start(songH5File)) #array float song.segmentsStart_mean, song.segmentsStart_var = convert_array_to_meanvar( hdf5_getters.get_segments_start(songH5File)) #array float song.tatumsConfidence_mean, song.tatumsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_tatums_confidence(songH5File)) #array float song.tatumsStart_mean, song.tatumsStart_var = convert_array_to_meanvar( hdf5_getters.get_tatums_start(songH5File)) #array2d float song.segmentsTimbre_mean, song.segmentsTimbre_var = covert_2darray_to_meanvar( hdf5_getters.get_segments_timbre(songH5File)) #array2d float song.segmentsPitches_mean, song.segmentsPitches_var = covert_2darray_to_meanvar( hdf5_getters.get_segments_pitches(songH5File)) #------------------------array string attributes------------------------ song.similarArtists = convert_array_to_string( hdf5_getters.get_similar_artists(songH5File)) #array string song.artistTerms = convert_array_to_string( hdf5_getters.get_artist_terms(songH5File)) #array string song.artistmbTags = convert_array_to_string( hdf5_getters.get_artist_mbtags(songH5File)) #array string return song
def main(): dataset_dir = sys.argv[1] global feat Create_BoW(dataset_dir) Size_BoW = Index_BoW(Bag_Words) count = Frequency(Size_BoW, dataset_dir) Size_BoW = Prune(count) Lablify() print "Forming Dataset..." listing1 = os.listdir(dataset_dir) for a in listing1: listing2 = os.listdir(dataset_dir+a+'/') for b in listing2: listing3 = os.listdir(dataset_dir+a+'/'+b+'/') for c in listing3: listing4 = os.listdir(dataset_dir+a+'/'+b+'/'+c+'/') for d in listing4: h5 = hdf5_getters.open_h5_file_read(dataset_dir+a+'/'+b+'/'+c+'/'+d) feat = [] temp = hdf5_getters.get_artist_hotttnesss(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue feat.append(temp) temp = hdf5_getters.get_artist_familiarity(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue feat.append(temp) temp = hdf5_getters.get_bars_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_beats_confidence(h5) if temp.size == 0: h5.close() continue mm = np.mean(temp) vv = np.var(temp) if mm==0.0 and vv==0.0: h5.close() continue feat.append(mm) feat.append(vv) feat.append(hdf5_getters.get_duration(h5)) temp = hdf5_getters.get_end_of_fade_in(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) feat.append(hdf5_getters.get_key(h5)) temp = hdf5_getters.get_key_confidence(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_loudness(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) feat.append(hdf5_getters.get_mode(h5)) temp = hdf5_getters.get_mode_confidence(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_sections_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_loudness_max(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_loudness_max_time(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_pitches(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_segments_timbre(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_start_of_fade_out(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_tatums_confidence(h5) if temp.size == 0: h5.close() continue MeanVar(temp) temp = hdf5_getters.get_tempo(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) feat.append(hdf5_getters.get_time_signature(h5)) temp = hdf5_getters.get_time_signature_confidence(h5) if (math.isnan(temp)): h5.close() continue feat.append(temp) temp = hdf5_getters.get_year(h5) if temp == 0: h5.close() continue feat.append(temp) temp = hdf5_getters.get_artist_terms(h5) if temp.size == 0: h5.close() continue temp_ = hdf5_getters.get_artist_terms_weight(h5) if temp_.size == 0: continue for j in Final_BoW: if j in temp: x = np.where(temp==j) x = x[0][0] feat.append(temp_[x]) else: x = 0.0 feat.append(x) temp = hdf5_getters.get_song_hotttnesss(h5) if (math.isnan(temp) or temp==0.0): h5.close() continue hott = 0 if temp >=0.75: hott = 1 elif temp >=0.40 and temp <0.75: hott = 2 else: hott = 3 feat.append(hott) h5.close() count = 1 f=open('MSD_DATASET.txt', 'a') outstring='' cnt = 0 feat_size = len(feat) for i in feat: cnt+=1 outstring+=str(i) if (cnt!=feat_size): outstring+=',' outstring+='\n' f.write(outstring) f.close()
def getInfo(files): data = [] build_str = '' with open(sys.argv[1], 'r') as f: contents = f.read() c = contents.split() f.close() print("creating csv with following fields:" + contents) for i in c: build_str = build_str + i + ',' build_str = build_str[:-1] build_str = build_str + '\n' for fil in files: curFile = getters.open_h5_file_read(fil) d2 = {} get_table = {'track_id': getters.get_track_id(curFile), 'segments_pitches': getters.get_segments_pitches(curFile), 'time_signature_confidence': getters.get_time_signature_confidence(curFile), 'song_hotttnesss': getters.get_song_hotttnesss(curFile), 'artist_longitude': getters.get_artist_longitude(curFile), 'tatums_confidence': getters.get_tatums_confidence(curFile), 'num_songs': getters.get_num_songs(curFile), 'duration': getters.get_duration(curFile), 'start_of_fade_out': getters.get_start_of_fade_out(curFile), 'artist_name': getters.get_artist_name(curFile), 'similar_artists': getters.get_similar_artists(curFile), 'artist_mbtags': getters.get_artist_mbtags(curFile), 'artist_terms_freq': getters.get_artist_terms_freq(curFile), 'release': getters.get_release(curFile), 'song_id': getters.get_song_id(curFile), 'track_7digitalid': getters.get_track_7digitalid(curFile), 'title': getters.get_title(curFile), 'artist_latitude': getters.get_artist_latitude(curFile), 'energy': getters.get_energy(curFile), 'key': getters.get_key(curFile), 'release_7digitalid': getters.get_release_7digitalid(curFile), 'artist_mbid': getters.get_artist_mbid(curFile), 'segments_confidence': getters.get_segments_confidence(curFile), 'artist_hotttnesss': getters.get_artist_hotttnesss(curFile), 'time_signature': getters.get_time_signature(curFile), 'segments_loudness_max_time': getters.get_segments_loudness_max_time(curFile), 'mode': getters.get_mode(curFile), 'segments_loudness_start': getters.get_segments_loudness_start(curFile), 'tempo': getters.get_tempo(curFile), 'key_confidence': getters.get_key_confidence(curFile), 'analysis_sample_rate': getters.get_analysis_sample_rate(curFile), 'bars_confidence': getters.get_bars_confidence(curFile), 'artist_playmeid': getters.get_artist_playmeid(curFile), 'artist_terms_weight': getters.get_artist_terms_weight(curFile), 'segments_start': getters.get_segments_start(curFile), 'artist_location': getters.get_artist_location(curFile), 'loudness': getters.get_loudness(curFile), 'year': getters.get_year(curFile), 'artist_7digitalid': getters.get_artist_7digitalid(curFile), 'audio_md5': getters.get_audio_md5(curFile), 'segments_timbre': getters.get_segments_timbre(curFile), 'mode_confidence': getters.get_mode_confidence(curFile), 'end_of_fade_in': getters.get_end_of_fade_in(curFile), 'danceability': getters.get_danceability(curFile), 'artist_familiarity': getters.get_artist_familiarity(curFile), 'artist_mbtags_count': getters.get_artist_mbtags_count(curFile), 'tatums_start': getters.get_tatums_start(curFile), 'artist_id': getters.get_artist_id(curFile), 'segments_loudness_max': getters.get_segments_loudness_max(curFile), 'bars_start': getters.get_bars_start(curFile), 'beats_start': getters.get_beats_start(curFile), 'artist_terms': getters.get_artist_terms(curFile), 'sections_start': getters.get_sections_start(curFile), 'beats_confidence': getters.get_beats_confidence(curFile), 'sections_confidence': getters.get_sections_confidence(curFile)} tid = fil.split('/')[-1].split('.')[0] # print(c) for i in c: if i in get_table: d2[i] = get_table[i] d2[i] = str(d2[i]).replace('\n','') build_str = build_str + d2[i] + ',' else: print('error: unspecified field') exit(0) build_str = build_str[:-1] # print(build_str[:-1]) build_str = build_str + '\n' curFile.close() build_str = build_str.replace('b','').replace("'",'').replace('"','') return (build_str)