def getSongProperties(songCount = 3000, splitData = True): songDict = {} songIdDict = {} songIdCount = 0 for root, dirs, files in os.walk(msd_subset_data_path): files = glob.glob(os.path.join(root,'*.h5')) for f in files: h5 = GETTERS.open_h5_file_read(f) tempo = GETTERS.get_tempo(h5) danceability = GETTERS.get_danceability(h5) energy = GETTERS.get_energy(h5) loudness = GETTERS.get_loudness(h5) #print GETTERS.get_artist_terms(h5) timbre = GETTERS.get_segments_timbre(h5) artist_hotness = GETTERS.get_artist_hotttnesss(h5) song_key = GETTERS.get_key(h5) songIdDict[GETTERS.get_song_id(h5)] = songIdCount songDict[songIdCount] = [tempo,danceability,energy,loudness,artist_hotness,song_key] songIdCount += 1 h5.close() #if len(songDict) >2: # break #if len(songDict) >2: # break if songIdCount > songCount and splitData: break return songIdDict,songDict
def extract_features(filename): h5 = hdf5_getters.open_h5_file_read(filename) f = [None] * len(features) f[features.index('track_id')] = hdf5_getters.get_track_id(h5, 0).item() f[features.index('song_id')] = hdf5_getters.get_song_id(h5, 0).item() f[features.index('hotttnesss')] = hdf5_getters.get_artist_hotttnesss( h5, 0).item() f[features.index('danceability')] = hdf5_getters.get_danceability( h5, 0).item() f[features.index('duration')] = hdf5_getters.get_duration(h5, 0).item() f[features.index('key')] = hdf5_getters.get_key(h5, 0).item() f[features.index('energy')] = hdf5_getters.get_energy(h5, 0).item() f[features.index('loudness')] = hdf5_getters.get_loudness(h5, 0).item() f[features.index('year')] = hdf5_getters.get_year(h5, 0).item() f[features.index('time_signature')] = hdf5_getters.get_time_signature( h5, 0).item() f[features.index('tempo')] = hdf5_getters.get_tempo(h5, 0).item() tags = '' for tag in hdf5_getters.get_artist_terms(h5): tags += ('%s|' % tag) # Remove trailing pipe. tags = tags[:len(tags) - 1] f[features.index('tags')] = tags h5.close() return f
def read(self, path): files = os.listdir(path) import csv with open('library_csv.csv', 'w') as library_csv: writer = csv.writer(library_csv) writer.writerow([ 'Loudness', 'Danceability', 'Energy', 'Tempo', 'timeSignature', 'Title' ]) # get params for filename in files: # hdf5path = filename hdf5path = "Data/" + filename # hdf5path.replace("'","",2) # sanity check if not os.path.isfile(hdf5path): print 'ERROR: file', hdf5path, 'does not exist.' sys.exit(0) h5 = hdf5_getters.open_h5_file_read(hdf5path) # get all getters loudness = hdf5.get_loudness(h5) dance = hdf5.get_danceability(h5) energy = hdf5.get_energy(h5) tempo = hdf5.get_tempo(h5) ts = hdf5.get_time_signature(h5) title = hdf5.get_title(h5) writer.writerow([loudness, dance, energy, tempo, ts, title]) # print them h5.close() library_csv.close()
def func_to_extract_features(filename): """ This function extracts all features: per-track, per-section and per-segment """ # - open the song file h5 = GETTERS.open_h5_file_read(filename) # - get per-track features and put them artist_id = GETTERS.get_artist_id(h5) song_id = GETTERS.get_song_id(h5) artist_familiarity = GETTERS.get_artist_familiarity(h5) artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5) artist_latitude = GETTERS.get_artist_latitude(h5) artist_longitude = GETTERS.get_artist_longitude(h5) danceability = GETTERS.get_danceability(h5) energy = GETTERS.get_energy(h5) loudness = GETTERS.get_loudness(h5) song_hotttnesss = GETTERS.get_song_hotttnesss(h5) tempo = GETTERS.get_tempo(h5) year = GETTERS.get_year(h5) # artist_ids.add(artist_id) # features_tuple = (artist_id, artist_familiarity, artist_hotttnesss, artist_latitude, artist_longitude, danceability, energy, loudness, song_hotttnesss, tempo, year) features_tuple = (artist_id, artist_familiarity, artist_hotttnesss, loudness, song_hotttnesss, tempo, year) # print features_tuple features_tuples[song_id] = features_tuple # files_per_artist[artist_id] += 1 # - close the file h5.close()
def func_to_desired_song_data(filename): h5 = GETTERS.open_h5_file_read(filename) track_id = GETTERS.get_track_id(h5) for song in random_songs: if song[0] == track_id: print("FOUND ONE!") title = replace_characters(GETTERS.get_title(h5)) artist = replace_characters(GETTERS.get_artist_name(h5)) year = GETTERS.get_year(h5) tempo = GETTERS.get_tempo(h5) key = GETTERS.get_key(h5) loudness = GETTERS.get_loudness(h5) energy = GETTERS.get_energy(h5) danceability = GETTERS.get_danceability(h5) time_signature = GETTERS.get_time_signature(h5) mode = GETTERS.get_mode(h5) hotttness = GETTERS.get_song_hotttnesss(h5) song_data = { 'title': title, 'artist': artist, 'year': year, 'tempo': tempo, 'key': key, 'loudness': loudness, 'energy': energy, 'danceability': danceability, 'time_signature': time_signature, 'mode': mode, 'hotttness': hotttness } all_the_data.append(song_data) h5.close()
def parse_songs(directory): global count global MAX_SONGS for filename in os.listdir(directory): if count >= MAX_SONGS: return file_path = os.path.join(directory, filename) if os.path.isdir(file_path): parse_songs(file_path) else: count += 1 if count % 100 == 0: print('Parsed ' + str(count) + ' songs') with hdf5_getters.open_h5_file_read(file_path) as h5: for i in range(hdf5_getters.get_num_songs(h5)): title = hdf5_getters.get_title(h5, i).decode('UTF-8') year = hdf5_getters.get_year(h5, i).item() danceability = hdf5_getters.get_danceability(h5, i).item() tags = hdf5_getters.get_artist_mbtags(h5, i).tolist() genres = [tag.decode('UTF-8') for tag in tags] tempo = hdf5_getters.get_tempo(h5, i).item() song = { 'title': title, 'year': year, 'danceability': danceability, 'genres': genres, 'tempo': tempo } song = os.path.splitext(filename) with open( "/home/ubuntu/million_songs/parsed_data/" + song[0] + '.json', 'w') as fp: json.dump(song, fp)
def get_attribute(files): array = [] count = 0 for f in files: temp = [] count += 1 print(f) h5 = hdf5_getters.open_h5_file_read(f) temp.append(hdf5_getters.get_num_songs(h5)) temp.append(hdf5_getters.get_artist_familiarity(h5)) temp.append(hdf5_getters.get_artist_hotttnesss(h5)) temp.append(hdf5_getters.get_danceability(h5)) temp.append(hdf5_getters.get_energy(h5)) temp.append(hdf5_getters.get_key(h5)) temp.append(hdf5_getters.get_key_confidence(h5)) temp.append(hdf5_getters.get_loudness(h5)) temp.append(hdf5_getters.get_mode(h5)) temp.append(hdf5_getters.get_mode_confidence(h5)) temp.append(hdf5_getters.get_tempo(h5)) temp.append(hdf5_getters.get_time_signature(h5)) temp.append(hdf5_getters.get_time_signature_confidence(h5)) temp.append(hdf5_getters.get_title(h5)) temp.append(hdf5_getters.get_artist_name(h5)) temp = np.nan_to_num(temp) array.append(temp) # if count%100 ==0: # print(array[count-100:count-1]) # kmean.fit(array[count-100:count-1]) h5.close() return array
def h5_to_csv_fields(h5,song): '''Converts h5 format to text Inputs: h5, an h5 file object, usable with the wrapper code MSongsDB song, an integer, representing which song in the h5 file to take the info out of (h5 files contain many songs) Output: a string representing all the information of this song, as a single line of a csv file ''' rv=[] ##All these are regular getter functions from wrapper code rv.append(gt.get_artist_name(h5,song)) rv.append(gt.get_title(h5, song)) rv.append(gt.get_release(h5, song)) rv.append(gt.get_year(h5,song)) rv.append(gt.get_duration(h5,song)) rv.append(gt.get_artist_familiarity(h5,song)) rv.append(gt.get_artist_hotttnesss(h5,song)) rv.append(gt.get_song_hotttnesss(h5, song)) ##artist_terms, artist_terms_freq, and artist_terms_weight getter functions ##are all arrays, so we need to turn them into strings first. We used '_' as a separator rv.append(array_to_csv_field(list(gt.get_artist_terms(h5,song)))) rv.append(array_to_csv_field(list(gt.get_artist_terms_freq(h5,song)))) rv.append(array_to_csv_field(list(gt.get_artist_terms_weight(h5,song)))) rv.append(gt.get_mode(h5,song)) rv.append(gt.get_key(h5,song)) rv.append(gt.get_tempo(h5,song)) rv.append(gt.get_loudness(h5,song)) rv.append(gt.get_danceability(h5,song)) rv.append(gt.get_energy(h5,song)) rv.append(gt.get_time_signature(h5,song)) rv.append(array_to_csv_field(list(gt.get_segments_start(h5,song)))) ##These arrays have vectors (Arrays) as items, 12 dimensional each ##An array like [[1,2,3],[4,5,6]] will be written to csv as '1;2;3_4;5;6', i.e. there's two types of separators rv.append(double_Array_to_csv_field(list(gt.get_segments_timbre(h5,song)),'_',';')) rv.append(double_Array_to_csv_field(list(gt.get_segments_pitches(h5,song)),'_',';')) rv.append(array_to_csv_field(list(gt.get_segments_loudness_start(h5,song)))) rv.append(array_to_csv_field(list(gt.get_segments_loudness_max(h5,song)))) rv.append(array_to_csv_field(list(gt.get_segments_loudness_max_time(h5,song)))) rv.append(array_to_csv_field(list(gt.get_sections_start(h5,song)))) ##turn this list into a string with comma separators (i.e. a csv line) rv_string=array_to_csv_field(rv, ",") rv_string+="\n" return rv_string
def get_attribute(f): temp = [] count += 1 print(f) h5 = hdf5_getters.open_h5_file_read(f) temp.append(hdf5_getters.get_num_songs(h5)) temp.append(hdf5_getters.get_artist_familiarity(h5)) temp.append(hdf5_getters.get_artist_hotttnesss(h5)) temp.append(hdf5_getters.get_danceability(h5)) temp.append(hdf5_getters.get_energy(h5)) temp.append(hdf5_getters.get_key(h5)) temp.append(hdf5_getters.get_key_confidence(h5)) temp.append(hdf5_getters.get_loudness(h5)) temp.append(hdf5_getters.get_mode(h5)) temp.append(hdf5_getters.get_mode_confidence(h5)) temp.append(hdf5_getters.get_tempo(h5)) temp.append(hdf5_getters.get_time_signature(h5)) temp.append(hdf5_getters.get_time_signature_confidence(h5)) temp = np.nan_to_num(temp) array.append(temp) h5.close()
def get_all_data(target, basedir, ext='.h5') : # header target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( "track_id", "song_id", "title", "artist_name", "artist_location", "artist_hotttnesss", "release", "year", "song_hotttnesss", "danceability", "duration", "loudness", "sample_rate", "tempo" )) count = 0 for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: for line in f: new_file = open("tmp.txt", 'w') new_file.write(line) h5 = hdf5_getters.open_h5_file_read(new_file) target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( hdf5_getters.get_track_id(h5), hdf5_getters.get_song_id(h5), hdf5_getters.get_title(h5), hdf5_getters.get_artist_name(h5), hdf5_getters.get_artist_location(h5), hdf5_getters.get_artist_hotttnesss(h5), hdf5_getters.get_release(h5), hdf5_getters.get_year(h5), hdf5_getters.get_song_hotttnesss(h5), hdf5_getters.get_danceability(h5), hdf5_getters.get_duration(h5), hdf5_getters.get_loudness(h5), hdf5_getters.get_analysis_sample_rate(h5), hdf5_getters.get_tempo(h5) )) # show progress count += 1 print "%d/10000" % (count) h5.close()
def get_all_attributes(filename): """ This function does 3 simple things: - open the song file - get all required attributes - write it to a csv file - close the files """ with open('attributes.csv', 'a') as csvfile: try: # let's apply the previous function to all files csvwriter = csv.writer(csvfile, delimiter='\t') h5 = GETTERS.open_h5_file_read(filename) RESULTS = [] RESULTS.append(GETTERS.get_year(h5)) RESULTS.append(GETTERS.get_artist_id(h5)) RESULTS.append(GETTERS.get_artist_name(h5)) RESULTS.append(GETTERS.get_artist_mbid(h5)) RESULTS.append(convert_terms(GETTERS.get_artist_terms(h5))) RESULTS.append(GETTERS.get_artist_hotttnesss(h5)) RESULTS.append(GETTERS.get_artist_latitude(h5)) RESULTS.append(GETTERS.get_artist_longitude(h5)) RESULTS.append(GETTERS.get_artist_familiarity(h5)) RESULTS.append(GETTERS.get_danceability(h5)) RESULTS.append(GETTERS.get_duration(h5)) RESULTS.append(GETTERS.get_energy(h5)) RESULTS.append(GETTERS.get_loudness(h5)) RESULTS.append(GETTERS.get_song_hotttnesss(h5)) RESULTS.append(GETTERS.get_song_id(h5)) RESULTS.append(GETTERS.get_tempo(h5)) RESULTS.append(GETTERS.get_time_signature(h5)) RESULTS.append(GETTERS.get_title(h5)) RESULTS.append(GETTERS.get_track_id(h5)) RESULTS.append(GETTERS.get_release(h5)) csvwriter.writerow(RESULTS) h5.close() except AttributeError: pass
def get_feats(h5): f = [] f.append(hdf5_getters.get_artist_name(h5).decode('utf8').replace(',', '')) f.append(hdf5_getters.get_title(h5).decode('utf8').replace(',', '')) f.append(str(hdf5_getters.get_loudness(h5))) f.append(str(hdf5_getters.get_tempo(h5))) f.append(str(hdf5_getters.get_time_signature(h5))) f.append(str(hdf5_getters.get_key(h5))) f.append(str(hdf5_getters.get_mode(h5))) f.append(str(hdf5_getters.get_duration(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_timbre(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_pitches(h5))) f.extend(get_statistical_feats(hdf5_getters.get_segments_loudness_max(h5))) f.extend( get_statistical_feats(hdf5_getters.get_segments_loudness_max_time(h5))) f.extend( get_statistical_feats(hdf5_getters.get_segments_loudness_start(h5))) f.append(str(hdf5_getters.get_song_hotttnesss(h5))) f.append(str(hdf5_getters.get_danceability(h5))) f.append(str(hdf5_getters.get_end_of_fade_in(h5))) f.append(str(hdf5_getters.get_energy(h5))) f.append(str(hdf5_getters.get_start_of_fade_out(h5))) f.append(str(hdf5_getters.get_year(h5))) return f
str(e) for e in GETTERS.get_artist_mbtags(h5, i)) # array artist_mbtags_count = ','.join( str(e) for e in GETTERS.get_artist_mbtags_count(h5, i)) # array artist_name = GETTERS.get_artist_name(h5, i) artist_playmeid = GETTERS.get_artist_playmeid(h5, i) artist_terms = ','.join( str(e) for e in GETTERS.get_artist_terms(h5, i)) # array #artist_terms_freq = ','.join(str(e) for e in GETTERS.get_artist_terms_freq(h5, i)) # array #artist_terms_weight = ','.join(str(e) for e in GETTERS.get_artist_terms_weight(h5, i)) # array #audio_md5 = GETTERS.get_audio_md5(h5, i) #bars_confidence = ','.join(str(e) for e in GETTERS.get_bars_confidence(h5, i)) # array #bars_start = ','.join(str(e) for e in GETTERS.get_bars_start(h5, i)) # array #beats_confidence = ','.join(str(e) for e in GETTERS.get_beats_confidence(h5, i)) # array #beats_start = ','.join(str(e) for e in GETTERS.get_beats_start(h5, i)) # array danceability = GETTERS.get_danceability(h5, i) duration = GETTERS.get_duration(h5, i) end_of_fade_in = GETTERS.get_end_of_fade_in(h5, i) energy = GETTERS.get_energy(h5, i) key = GETTERS.get_key(h5, i) key_confidence = GETTERS.get_key_confidence(h5, i) loudness = GETTERS.get_loudness(h5, i) mode = GETTERS.get_mode(h5, i) mode_confidence = GETTERS.get_mode_confidence(h5, i) release = GETTERS.get_release(h5, i) release_7digitalid = GETTERS.get_release_7digitalid(h5, i) #sections_confidence = ','.join(str(e) for e in GETTERS.get_sections_confidence(h5, i)) # array #sections_start = ','.join(str(e) for e in GETTERS.get_sections_start(h5, i)) # array #segments_confidence = ','.join(str(e) for e in GETTERS.get_segments_confidence(h5, i)) # array #segments_loudness_max = ','.join(str(e) for e in GETTERS.get_segments_loudness_max(h5, i)) # array #segments_loudness_max_time = ','.join(str(e) for e in GETTERS.get_segments_loudness_max_time(h5, i)) # array
def main(): outputFileName = sys.argv[2] outputFile1 = open(outputFileName, 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input( "\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude," + " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" # outputFile1.write(csvRowString); csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = ( "SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation," + "ArtistLongitude,ArtistFamiliarity,ArtistHotttnesss,ArtistName," + "ArtistMBTags,ArtistTerms," + "Danceability,Energy,Duration,KeySignature," + "KeySignatureConfidence,Loudness,Mode,Hotttnesss,Tempo,TimeSignature,TimeSignatureConfidence," + "Title,Year") ################################################# csvAttributeList = re.split('\W+', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() # outputFile1.write("SongNumber,"); # outputFile1.write(csvRowString + "\n"); csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = sys.argv[1] # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print f songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumName = str(hdf5_getters.get_release(songH5File)) song.artistLatitude = str( hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str( hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str( hdf5_getters.get_artist_longitude(songH5File)) song.artistFamiliarity = str( hdf5_getters.get_artist_familiarity(songH5File)) song.artistHotttnesss = str( hdf5_getters.get_artist_hotttnesss(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.artistMBTags = ','.join( hdf5_getters.get_artist_mbtags(songH5File)) # song.artistMBTagsCount = ','.join(hdf5_getters.get_artist_mbtags_count(songH5File)) song.artistTerms = ','.join( hdf5_getters.get_artist_terms(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) # song.lyrics = None # song.popularity = None song.hotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File)) song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str( hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) #print song count # csvRowString += str(song.songCount) + "," rowString = json.dumps({ 'AlbumID': song.albumID, 'AlbumName': song.albumName, 'ArtistID': song.artistID, 'ArtistLatitude': song.artistLatitude, 'ArtistLocation': song.artistLocation, 'ArtistLongitude': song.artistLongitude, 'ArtistFamiliarity': song.artistFamiliarity, 'ArtistHotttnesss': song.artistHotttnesss, 'ArtistName': song.artistName, 'ArtistMBTags': song.artistMBTags, 'ArtistTerms': song.artistTerms, 'Danceability': song.danceability, 'Energy': song.energy, 'Duration': song.duration, 'KeySignature': song.keySignature, 'KeySignatureConfidence': song.keySignatureConfidence, 'Loudness': song.loudness, 'Mode': song.mode, 'Hotttnesss': song.hotttnesss, 'Tempo': song.tempo, 'SongID': song.id, 'TimeSignature': song.timeSignature, 'TimeSignatureConfidence': song.timeSignatureConfidence, 'Title': song.title, 'Year': song.year, }) #Remove the final comma from each row in the csv rowString += "\n" outputFile1.write(rowString) songH5File.close() outputFile1.close()
artist_location = GETTERS.get_artist_location(h5, i) artist_longitude = GETTERS.get_artist_longitude(h5, i) artist_mbid = GETTERS.get_artist_mbid(h5, i) artist_mbtags = ','.join(str(e) for e in GETTERS.get_artist_mbtags(h5, i)) # array artist_mbtags_count = ','.join(str(e) for e in GETTERS.get_artist_mbtags_count(h5, i)) # array artist_name = GETTERS.get_artist_name(h5, i) artist_playmeid = GETTERS.get_artist_playmeid(h5, i) artist_terms = ','.join(str(e) for e in GETTERS.get_artist_terms(h5, i)) # array #artist_terms_freq = ','.join(str(e) for e in GETTERS.get_artist_terms_freq(h5, i)) # array #artist_terms_weight = ','.join(str(e) for e in GETTERS.get_artist_terms_weight(h5, i)) # array #audio_md5 = GETTERS.get_audio_md5(h5, i) #bars_confidence = ','.join(str(e) for e in GETTERS.get_bars_confidence(h5, i)) # array #bars_start = ','.join(str(e) for e in GETTERS.get_bars_start(h5, i)) # array #beats_confidence = ','.join(str(e) for e in GETTERS.get_beats_confidence(h5, i)) # array #beats_start = ','.join(str(e) for e in GETTERS.get_beats_start(h5, i)) # array danceability = GETTERS.get_danceability(h5, i) duration = GETTERS.get_duration(h5, i) end_of_fade_in = GETTERS.get_end_of_fade_in(h5, i) energy = GETTERS.get_energy(h5, i) key = GETTERS.get_key(h5, i) key_confidence = GETTERS.get_key_confidence(h5, i) loudness = GETTERS.get_loudness(h5, i) mode = GETTERS.get_mode(h5, i) mode_confidence = GETTERS.get_mode_confidence(h5, i) release = GETTERS.get_release(h5, i) release_7digitalid = GETTERS.get_release_7digitalid(h5, i) #sections_confidence = ','.join(str(e) for e in GETTERS.get_sections_confidence(h5, i)) # array #sections_start = ','.join(str(e) for e in GETTERS.get_sections_start(h5, i)) # array #segments_confidence = ','.join(str(e) for e in GETTERS.get_segments_confidence(h5, i)) # array #segments_loudness_max = ','.join(str(e) for e in GETTERS.get_segments_loudness_max(h5, i)) # array #segments_loudness_max_time = ','.join(str(e) for e in GETTERS.get_segments_loudness_max_time(h5, i)) # array
def writeSingleHDF5FileToTxtFile(songHDF5FileName): global maximumArtistNameLen global maximumArtistTagLen global maximumSongNameLen global maximumAlbumNameLen """ This function does 3 simple things: - open the song file - get artist ID and put it - close the file """ songHDF5File = GETTERS.open_h5_file_read(songHDF5FileName) songID = GETTERS.get_song_id(songHDF5File) songName = GETTERS.get_title(songHDF5File) artistID = GETTERS.get_artist_id(songHDF5File) songAlbum = GETTERS.get_release(songHDF5File) songYear = GETTERS.get_year(songHDF5File) songTempo = GETTERS.get_tempo(songHDF5File) songDanceability = GETTERS.get_danceability(songHDF5File) songDuration = GETTERS.get_duration(songHDF5File) songEnergy = GETTERS.get_energy(songHDF5File) songKey = GETTERS.get_key(songHDF5File) songLoudness = GETTERS.get_loudness(songHDF5File) songMode = GETTERS.get_mode(songHDF5File) songTimeSignature = GETTERS.get_time_signature(songHDF5File) songsTableFile.write(songID + "\t" + songName + "\t" + artistID + "\t" + songAlbum + "\t" + str(songYear) + "\t" + str(songTempo) + "\t" + str(songDanceability) + "\t" + str(songDuration) + "\t" + str(songEnergy) + "\t" + str(songKey) + "\t" + str(songLoudness) + "\t" + str(songMode) + "\t" + str(songTimeSignature) + "\t\n") artistName = GETTERS.get_artist_name(songHDF5File) artistFamiliarity = GETTERS.get_artist_familiarity(songHDF5File) artistTagsArray = GETTERS.get_artist_mbtags(songHDF5File) artistsTableFile.write(artistID + "\t" + artistName + "\t" + str(artistFamiliarity) + "\t\n") if len(songName) > maximumSongNameLen: maximumSongNameLen = len(songName) if len(songAlbum) > maximumAlbumNameLen: maximumAlbumNameLen = len(songAlbum) if len(artistName) > maximumArtistNameLen: maximumArtistNameLen = len(artistName) for artistTag in artistTagsArray: if artistTag in allowedTagsSet: artistsTagsTableFile.write(artistID + "\t" + artistTag + "\t\n") if artistTag not in tagsSet: tagsTableFile.write(artistTag + "\t\n") tagsSet.add(artistTag) if len(artistTag) > maximumArtistTagLen: maximumArtistTagLen = len(artistTag) similarArtists = GETTERS.get_similar_artists(songHDF5File) for similarArtist in similarArtists: similarArtistsPairsList.add((artistID, similarArtist)) artistsIDsSet.add(artistID) artistsNamesSet.add(artistName) songHDF5File.close()
def main(): outputFile = open('songs.csv', 'w') writer = csv.writer(outputFile) csvRowString = "song_number,artist_familiarity,artist_hotttnesss,artist_id,artist_mbid,artist_playmeid,artist_7digitalid,artist_latitude,artist_longitude,artist_location,artist_name,release,release_7digitalid,song_id,song_hotttnesss,title,track_7digitalid,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id,year" outputFile.write(csvRowString + "\n") csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "." # "." As the default means the current directory ext = ".H5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP songCount = 0 for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print(f) songH5File = hdf5_getters.open_h5_file_read(f) values = [ songCount, hdf5_getters.get_artist_familiarity(songH5File), hdf5_getters.get_artist_hotttnesss(songH5File), hdf5_getters.get_artist_id(songH5File), hdf5_getters.get_artist_mbid(songH5File), hdf5_getters.get_artist_playmeid(songH5File), hdf5_getters.get_artist_7digitalid(songH5File), hdf5_getters.get_artist_latitude(songH5File), hdf5_getters.get_artist_longitude(songH5File), hdf5_getters.get_artist_location(songH5File), hdf5_getters.get_artist_name(songH5File), hdf5_getters.get_release(songH5File), hdf5_getters.get_release_7digitalid(songH5File), hdf5_getters.get_song_id(songH5File), hdf5_getters.get_song_hotttnesss(songH5File), hdf5_getters.get_title(songH5File), hdf5_getters.get_track_7digitalid(songH5File), hdf5_getters.get_analysis_sample_rate(songH5File), hdf5_getters.get_audio_md5(songH5File), hdf5_getters.get_danceability(songH5File), hdf5_getters.get_duration(songH5File), hdf5_getters.get_end_of_fade_in(songH5File), hdf5_getters.get_energy(songH5File), hdf5_getters.get_key(songH5File), hdf5_getters.get_key_confidence(songH5File), hdf5_getters.get_loudness(songH5File), hdf5_getters.get_mode(songH5File), hdf5_getters.get_mode_confidence(songH5File), hdf5_getters.get_start_of_fade_out(songH5File), hdf5_getters.get_tempo(songH5File), hdf5_getters.get_time_signature(songH5File), hdf5_getters.get_time_signature_confidence(songH5File), hdf5_getters.get_track_id(songH5File), hdf5_getters.get_year(songH5File) ] songH5File.close() songCount = songCount + 1 writer.writerow(values) outputFile.close()
def main(): outputFile1 = open('SongCSV.csv', 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input( "\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude," + " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Familiarity'.lower(): ####Added by us! csvRowString += song.familiarity elif attribute == 'artist_mbid'.lower(): csvRowString += song.artist_mbid elif attribute == 'artist_playmeid'.lower(): csvRowString += song.artist_playmeid elif attribute == 'artist_7digid'.lower(): csvRowString += song.artist_7digid elif attribute == 'hottness'.lower(): csvRowString += song.hottness elif attribute == 'song_hottness'.lower(): csvRowString += song.song_hottness elif attribute == 'digitalid7'.lower(): csvRowString += song.digitalid7 elif attribute == 'similar_artists'.lower(): csvRowString += song.similar_artists elif attribute == 'artist_terms'.lower(): csvRowString += song.artist_terms elif attribute == 'art_terms_freq'.lower(): csvRowString += song.art_terms_freq elif attribute == 'art_terms_weight'.lower(): csvRowString += song.art_terms_weight elif attribute == 'a_sample_rate'.lower(): csvRowString += song.a_sample_rate elif attribute == 'audio_md5'.lower(): csvRowString += song.audio_md5 elif attribute == 'end_of_fade_in'.lower(): csvRowString += song.end_of_fade_in elif attribute == 'energy'.lower(): csvRowString += song.energy elif attribute == 'loudness'.lower(): csvRowString += song.loudness elif attribute == 'mode'.lower(): csvRowString += song.mode elif attribute == 'mode_conf'.lower(): csvRowString += song.mode_conf elif attribute == 'start_of_fade_out'.lower(): csvRowString += song.start_of_fade_out elif attribute == 'trackid'.lower(): csvRowString += song.trackid elif attribute == 'segm_start'.lower(): csvRowString += song.segm_start elif attribute == 'segm_conf'.lower(): csvRowString += song.segm_conf elif attribute == 'segm_pitch'.lower(): csvRowString += song.segm_pitch elif attribute == 'segm_timbre'.lower(): csvRowString += song.segm_timbre elif attribute == 'segm_max_loud'.lower(): csvRowString += song.segm_max_loud elif attribute == 'segm_max_loud_time'.lower(): csvRowString += song.segm_max_loud_time elif attribute == 'segm_loud_start'.lower(): csvRowString += song.segm_loud_start elif attribute == 'sect_start'.lower(): csvRowString += song.sect_start elif attribute == 'sect_conf'.lower(): csvRowString += song.sect_conf elif attribute == 'beats_start'.lower(): csvRowString += song.beats_start elif attribute == 'beats_conf'.lower(): csvRowString += song.beats_conf elif attribute == 'bars_start'.lower(): csvRowString += song.bars_start elif attribute == 'bars_conf'.lower(): csvRowString += song.bars_conf elif attribute == 'tatums_start'.lower(): csvRowString += song.tatums_start elif attribute == 'tatums_conf'.lower(): csvRowString += song.tatums_conf elif attribute == 'artist_mbtags'.lower(): csvRowString += song.artist_mbtags elif attribute == 'artist_mbtags_count'.lower(): csvRowString += song.artist_mbtags_count elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print("==============") print("I believe there has been an error with the input.") print("==============") break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = "SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,Title,Year,Familiarity,Artist_Mbid,Artist_PlaymeId,Artist_7didId,Hottness,Song_Hottness,7digitalid,A_Sample_Rate,Audio_Md5,End_Of_Fade_In,Energy,Loudness,Mode,Mode_Conf,Start_Of_Fade_Out,TrackId" ################################################# csvAttributeList = re.split(',', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "/home/bigdata/smalltest/" # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print(f) songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) # testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumName = str(hdf5_getters.get_release(songH5File)) song.artistLatitude = str( hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str( hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str( hdf5_getters.get_artist_longitude(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) # song.lyrics = None # song.popularity = None song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str( hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) #########Added by us! song.familiarity = str( hdf5_getters.get_artist_familiarity(songH5File)) song.artist_mbid = str(hdf5_getters.get_artist_mbid(songH5File)) song.artist_playmeid = str( hdf5_getters.get_artist_playmeid(songH5File)) song.artist_7digid = str( hdf5_getters.get_artist_7digitalid(songH5File)) song.hottness = str(hdf5_getters.get_artist_hotttnesss(songH5File)) song.song_hottness = str( hdf5_getters.get_song_hotttnesss(songH5File)) song.digitalid7 = str( hdf5_getters.get_track_7digitalid(songH5File)) #song.similar_artists = str(hdf5_getters.get_similar_artists(songH5File)) #song.artist_terms = str(hdf5_getters.get_artist_terms(songH5File)) #song.art_terms_freq = str(hdf5_getters.get_artist_terms_freq(songH5File)) #song.art_terms_weight = str(hdf5_getters.get_artist_terms_weight(songH5File)) song.a_sample_rate = str( hdf5_getters.get_analysis_sample_rate(songH5File)) song.audio_md5 = str(hdf5_getters.get_audio_md5(songH5File)) song.end_of_fade_in = str( hdf5_getters.get_end_of_fade_in(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.mode_conf = str(hdf5_getters.get_mode_confidence(songH5File)) song.start_of_fade_out = str( hdf5_getters.get_start_of_fade_out(songH5File)) song.trackid = str(hdf5_getters.get_track_id(songH5File)) #song.segm_start = str(hdf5_getters.get_segments_start(songH5File)) #song.segm_conf = str(hdf5_getters.get_segments_confidence(songH5File)) #song.segm_pitch = str(hdf5_getters.get_segments_pitches(songH5File)) #song.segm_timbre = str(hdf5_getters.get_segments_timbre(songH5File)) #song.segm_max_loud = str(hdf5_getters.get_segments_loudness_max(songH5File)) #song.segm_max_loud_time = str(hdf5_getters.get_segments_loudness_max_time(songH5File)) #song.segm_loud_start = str(hdf5_getters.get_segments_loudness_start(songH5File)) #song.sect_start = str(hdf5_getters.get_sections_start(songH5File)) #song.sect_conf = str(hdf5_getters.get_sections_confidence(songH5File)) #song.beats_start = str(hdf5_getters.get_beats_start(songH5File)) #song.beats_conf = str(hdf5_getters.get_beats_confidence(songH5File)) #song.bars_start = str(hdf5_getters.get_bars_start(songH5File)) #song.bars_conf = str(hdf5_getters.get_bars_confidence(songH5File)) #song.tatums_start = str(hdf5_getters.get_tatums_start(songH5File)) #song.tatums_conf = str(hdf5_getters.get_tatums_confidence(songH5File)) #song.artist_mbtags = str(hdf5_getters.get_artist_mbtags(songH5File)) #song.artist_mbtags_count = str(hdf5_getters.get_artist_mbtags_count(songH5File)) #print song count #csvRowString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace("b\"", "") albumName = albumName.replace("\"", "") albumName = albumName.replace(',', "") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',', '') location = location.replace("b\"", "") location = location.replace("\"", "") csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): artistName = song.artistName artistName = artistName.replace("b\"", "") artistName = artistName.replace("\"", "") csvRowString += "\"" + artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): t = song.title t = t.replace("b\"", "") t = t.replace("\"", "") csvRowString += "\"" + t + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year elif attribute == 'Familiarity'.lower(): ####Added by us! csvRowString += song.familiarity elif attribute == 'artist_mbid'.lower(): csvRowString += "\"" + song.artist_mbid + "\"" elif attribute == 'artist_playmeid'.lower(): csvRowString += song.artist_playmeid elif attribute == 'artist_7digid'.lower(): csvRowString += song.artist_7digid elif attribute == 'hottness'.lower(): csvRowString += song.hottness elif attribute == 'song_hottness'.lower(): csvRowString += song.song_hottness elif attribute == 'digitalid7'.lower(): csvRowString += song.digitalid7 elif attribute == 'similar_artists'.lower(): csvRowString += song.similar_artists elif attribute == 'artist_terms'.lower(): csvRowString += song.artist_terms elif attribute == 'art_terms_freq'.lower(): csvRowString += song.art_terms_freq elif attribute == 'art_terms_weight'.lower(): csvRowString += song.art_terms_weight elif attribute == 'a_sample_rate'.lower(): csvRowString += song.a_sample_rate elif attribute == 'audio_md5'.lower(): csvRowString += "\"" + song.audio_md5 + "\"" elif attribute == 'end_of_fade_in'.lower(): csvRowString += song.end_of_fade_in elif attribute == 'energy'.lower(): csvRowString += song.energy elif attribute == 'loudness'.lower(): csvRowString += song.loudness elif attribute == 'mode'.lower(): csvRowString += song.mode elif attribute == 'mode_conf'.lower(): csvRowString += song.mode_conf elif attribute == 'start_of_fade_out'.lower(): csvRowString += song.start_of_fade_out elif attribute == 'trackid'.lower(): csvRowString += "\"" + song.trackid + "\"" elif attribute == 'segm_start'.lower(): csvRowString += song.segm_start elif attribute == 'segm_conf'.lower(): csvRowString += song.segm_conf elif attribute == 'segm_pitch'.lower(): csvRowString += song.segm_pitch elif attribute == 'segm_timbre'.lower(): csvRowString += song.segm_timbre elif attribute == 'segm_max_loud'.lower(): csvRowString += song.segm_max_loud elif attribute == 'segm_max_loud_time'.lower(): csvRowString += song.segm_max_loud_time elif attribute == 'segm_loud_start'.lower(): csvRowString += song.segm_loud_start elif attribute == 'sect_start'.lower(): csvRowString += song.sect_start elif attribute == 'sect_conf'.lower(): csvRowString += song.sect_conf elif attribute == 'beats_start'.lower(): csvRowString += song.beats_start elif attribute == 'beats_conf'.lower(): csvRowString += song.beats_conf elif attribute == 'bars_start'.lower(): csvRowString += song.bars_start elif attribute == 'bars_conf'.lower(): csvRowString += song.bars_conf elif attribute == 'tatums_start'.lower(): csvRowString += song.tatums_start elif attribute == 'tatums_conf'.lower(): csvRowString += song.tatums_conf elif attribute == 'artist_mbtags'.lower(): csvRowString += song.artist_mbtags elif attribute == 'artist_mbtags_count'.lower(): csvRowString += song.artist_mbtags_count else: csvRowString += "\"ERR\"" csvRowString += "," #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" songH5File.close() outputFile1.close()
def data_to_flat_file(basedir,ext='.h5') : """ This function extracts the information from the tables and creates the flat file. """ count = 0; #song counter list_to_write= [] group_index=0 row_to_write = "" writer = csv.writer(open("complete.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: row=[] print f h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') row.append(title) comma=title.find(',') if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') row.append(album) comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') row.append(artist_name) duration = hdf5_getters.get_duration(h5) row.append(duration) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) row.append(samp_rt) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) row.append(artist_7digitalid) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 row.append(artist_fam) artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 row.append(artist_hotness) artist_id = hdf5_getters.get_artist_id(h5) row.append(artist_id) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 row.append(artist_lat) artist_loc = hdf5_getters.get_artist_location(h5) row.append(artist_loc) artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 row.append(artist_lon) artist_mbid = hdf5_getters.get_artist_mbid(h5) row.append(artist_mbid) #Getting the genre art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq genre_set=0 #flag to see if the genre has been set or not final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 if genre_set == 1: col_num=[] for i in final_genre: column=int(i) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) else: genre_array=genre_columns(-1) #when there is no genre matched, return an array of [0...0] for i in range(len(genre_array)): #appending the genre_array to the row row.append(genre_array[i]) artist_pmid = hdf5_getters.get_artist_playmeid(h5) row.append(artist_pmid) audio_md5 = hdf5_getters.get_audio_md5(h5) row.append(audio_md5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 row.append(danceability) end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 row.append(end_fade_in) energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 row.append(energy) song_key = hdf5_getters.get_key(h5) row.append(song_key) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 row.append(key_c) loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 row.append(loudness) mode = hdf5_getters.get_mode(h5) row.append(mode) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 row.append(mode_conf) release_7digitalid = hdf5_getters.get_release_7digitalid(h5) row.append(release_7digitalid) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 row.append(song_hot) song_id = hdf5_getters.get_song_id(h5) row.append(song_id) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) row.append(start_fade_out) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 row.append(tempo) time_sig = hdf5_getters.get_time_signature(h5) row.append(time_sig) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 row.append(time_sig_c) track_id = hdf5_getters.get_track_id(h5) row.append(track_id) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) row.append(track_7digitalid) year = hdf5_getters.get_year(h5) row.append(year) bars_c = hdf5_getters.get_bars_confidence(h5) bars_start = hdf5_getters.get_bars_start(h5) row_bars_padding=padding(245) #this is the array that will be attached at the end of th row #--------------bars---------------" gral_info=[] gral_info=row[:] empty=[] for i,item in enumerate(bars_c): row.append(group_index) row.append(i) row.append(bars_c[i]) bars_c_avg= get_avg(bars_c) row.append(bars_c_avg) bars_c_max= get_max(bars_c) row.append(bars_c_max) bars_c_min = get_min(bars_c) row.append(bars_c_min) bars_c_stddev= get_stddev(bars_c) row.append(bars_c_stddev) bars_c_count = get_count(bars_c) row.append(bars_c_count) bars_c_sum = get_sum(bars_c) row.append(bars_c_sum) row.append(bars_start[i]) bars_start_avg = get_avg(bars_start) row.append(bars_start_avg) bars_start_max= get_max(bars_start) row.append(bars_start_max) bars_start_min = get_min(bars_start) row.append(bars_start_min) bars_start_stddev= get_stddev(bars_start) row.append(bars_start_stddev) bars_start_count = get_count(bars_start) row.append(bars_start_count) bars_start_sum = get_sum(bars_start) row.append(bars_start_sum) for i in row_bars_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------beats---------------" beats_c = hdf5_getters.get_beats_confidence(h5) group_index=1 row=[] row=gral_info[:] row_front=padding(14) #blanks left in front of the row(empty spaces for bars) row_beats_padding=padding(231) for i,item in enumerate(beats_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the beats row.append(index) row.append(beats_c[i]) beats_c_avg= get_avg(beats_c) row.append(beats_c_avg) beats_c_max= get_max(beats_c) row.append(beats_c_max) beats_c_min = get_min(beats_c) row.append(beats_c_min) beats_c_stddev= get_stddev(beats_c) row.append(beats_c_stddev) beats_c_count = get_count(beats_c) row.append(beats_c_count) beats_c_sum = get_sum(beats_c) row.append(beats_c_sum) beats_start = hdf5_getters.get_beats_start(h5) row.append(beats_start[i]) beats_start_avg = get_avg(beats_start) row.append(beats_start_avg) beats_start_max= get_max(beats_start) row.append(beats_start_max) beats_start_min = get_min(beats_start) row.append(beats_start_min) beats_start_stddev= get_stddev(beats_start) row.append(beats_start_stddev) beats_start_count = get_count(beats_start) row.append(beats_start_count) beats_start_sum = get_sum(beats_start) row.append(beats_start_sum) for i in row_beats_padding: row.append(i) writer.writerow(row) row=[] row=gral_info[:] # "--------sections---------------" row_sec_padding=padding(217) #blank spaces left at the end of the row sec_c = hdf5_getters.get_sections_confidence(h5) group_index=2 row=[] row=gral_info[:] row_front=padding(28) #blank spaces left in front(empty spaces for bars,beats) for i,item in enumerate(sec_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the sections row.append(index) row.append(sec_c[i]) sec_c_avg= get_avg(sec_c) row.append(sec_c_avg) sec_c_max= get_max(sec_c) row.append(sec_c_max) sec_c_min = get_min(sec_c) row.append(sec_c_min) sec_c_stddev= get_stddev(sec_c) row.append(sec_c_stddev) sec_c_count = get_count(sec_c) row.append(sec_c_count) sec_c_sum = get_sum(sec_c) row.append(sec_c_sum) sec_start = hdf5_getters.get_sections_start(h5) row.append(sec_start[i]) sec_start_avg = get_avg(sec_start) row.append(sec_start_avg) sec_start_max= get_max(sec_start) row.append(sec_start_max) sec_start_min = get_min(sec_start) row.append(sec_start_min) sec_start_stddev= get_stddev(sec_start) row.append(sec_start_stddev) sec_start_count = get_count(sec_start) row.append(sec_start_count) sec_start_sum = get_sum(sec_start) row.append(sec_start_sum) for i in row_sec_padding: #appending the blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #--------segments---------------" row_seg_padding=padding(182) #blank spaces at the end of the row row_front=padding(42) #blank spaces left in front of segments seg_c = hdf5_getters.get_segments_confidence(h5) group_index=3 row=[] row=gral_info[:] for i,item in enumerate(seg_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the segments row.append(index) row.append(seg_c[i]) seg_c_avg= get_avg(seg_c) row.append(seg_c_avg) seg_c_max= get_max(seg_c) row.append(seg_c_max) seg_c_min = get_min(seg_c) row.append(seg_c_min) seg_c_stddev= get_stddev(seg_c) row.append(seg_c_stddev) seg_c_count = get_count(seg_c) row.append(seg_c_count) seg_c_sum = get_sum(seg_c) row.append(seg_c_sum) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) row.append(seg_loud_max[i]) seg_loud_max_avg= get_avg(seg_loud_max) row.append(seg_loud_max_avg) seg_loud_max_max= get_max(seg_loud_max) row.append(seg_loud_max_max) seg_loud_max_min = get_min(seg_loud_max) row.append(seg_loud_max_min) seg_loud_max_stddev= get_stddev(seg_loud_max) row.append(seg_loud_max_stddev) seg_loud_max_count = get_count(seg_loud_max) row.append(seg_loud_max_count) seg_loud_max_sum = get_sum(seg_loud_max) row.append(seg_loud_max_sum) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) row.append(seg_loud_max_time[i]) seg_loud_max_time_avg= get_avg(seg_loud_max_time) row.append(seg_loud_max_time_avg) seg_loud_max_time_max= get_max(seg_loud_max_time) row.append(seg_loud_max_time_max) seg_loud_max_time_min = get_min(seg_loud_max_time) row.append(seg_loud_max_time_min) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) row.append(seg_loud_max_time_stddev) seg_loud_max_time_count = get_count(seg_loud_max_time) row.append(seg_loud_max_time_count) seg_loud_max_time_sum = get_sum(seg_loud_max_time) row.append(seg_loud_max_time_sum) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) row.append(seg_loud_start[i]) seg_loud_start_avg= get_avg(seg_loud_start) row.append(seg_loud_start_avg) seg_loud_start_max= get_max(seg_loud_start) row.append(seg_loud_start_max) seg_loud_start_min = get_min(seg_loud_start) row.append(seg_loud_start_min) seg_loud_start_stddev= get_stddev(seg_loud_start) row.append(seg_loud_start_stddev) seg_loud_start_count = get_count(seg_loud_start) row.append(seg_loud_start_count) seg_loud_start_sum = get_sum(seg_loud_start) row.append(seg_loud_start_sum) seg_start = hdf5_getters.get_segments_start(h5) row.append(seg_start[i]) seg_start_avg= get_avg(seg_start) row.append(seg_start_avg) seg_start_max= get_max(seg_start) row.append(seg_start_max) seg_start_min = get_min(seg_start) row.append(seg_start_min) seg_start_stddev= get_stddev(seg_start) row.append(seg_start_stddev) seg_start_count = get_count(seg_start) row.append(seg_start_count) seg_start_sum = get_sum(seg_start) row.append(seg_start_sum) for i in row_seg_padding: #appending blank spaces at the end of the row row.append(i) writer.writerow(row) row=[] row=gral_info[:] #----------segments pitch and timbre---------------" row_seg2_padding=padding(14) #blank spaces left at the end of the row row_front=padding(77) #blank spaces left at the front of the segments and timbre seg_pitch = hdf5_getters.get_segments_pitches(h5) transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows group_index=4 row=[] row=gral_info[:] for i,item in enumerate(transpose_pitch[0]): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of segments and timbre row.append(index) row.append(transpose_pitch[0][i]) seg_pitch_avg= get_avg(transpose_pitch[0]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[0]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[0]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[0]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[0]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[0]) row.append(seg_pitch_sum) row.append(transpose_pitch[1][i]) seg_pitch_avg= get_avg(transpose_pitch[1]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[1]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[1]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[1]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[1]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[1]) row.append(seg_pitch_sum) row.append(transpose_pitch[2][i]) seg_pitch_avg= get_avg(transpose_pitch[2]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[2]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[2]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[2]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[2]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[2]) row.append(seg_pitch_sum) row.append(transpose_pitch[3][i]) seg_pitch_avg= get_avg(transpose_pitch[3]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[3]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[3]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[3]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[3]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[3]) row.append(seg_pitch_sum) row.append(transpose_pitch[4][i]) seg_pitch_avg= get_avg(transpose_pitch[4]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[4]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[4]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[4]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[4]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[4]) row.append(seg_pitch_sum) row.append(transpose_pitch[5][i]) seg_pitch_avg= get_avg(transpose_pitch[5]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[5]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[5]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[5]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[5]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[5]) row.append(seg_pitch_sum) row.append(transpose_pitch[6][i]) seg_pitch_avg= get_avg(transpose_pitch[6]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[6]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[6]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[6]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[6]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[6]) row.append(seg_pitch_sum) row.append(transpose_pitch[7][i]) seg_pitch_avg= get_avg(transpose_pitch[7]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[7]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[7]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[7]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[7]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[7]) row.append(seg_pitch_sum) row.append(transpose_pitch[8][i]) seg_pitch_avg= get_avg(transpose_pitch[8]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[8]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[8]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[8]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[8]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[8]) row.append(seg_pitch_sum) row.append(transpose_pitch[9][i]) seg_pitch_avg= get_avg(transpose_pitch[9]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[9]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[9]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[9]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[9]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[9]) row.append(seg_pitch_sum) row.append(transpose_pitch[10][i]) seg_pitch_avg= get_avg(transpose_pitch[10]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[10]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[10]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[10]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[10]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[10]) row.append(seg_pitch_sum) row.append(transpose_pitch[11][i]) seg_pitch_avg= get_avg(transpose_pitch[11]) row.append(seg_pitch_avg) seg_pitch_max= get_max(transpose_pitch[11]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[11]) row.append(seg_pitch_min) seg_pitch_stddev= get_stddev(transpose_pitch[11]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[11]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[11]) row.append(seg_pitch_sum) #timbre arrays seg_timbre = hdf5_getters.get_segments_timbre(h5) transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows row.append(transpose_timbre[0][i]) seg_timbre_avg= get_avg(transpose_timbre[0]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[0]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[0]) row.append(seg_timbre_min) seg_timbre_stddev=get_stddev(transpose_timbre[0]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[0]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[0]) row.append(seg_timbre_sum) row.append(transpose_timbre[1][i]) seg_timbre_avg= get_avg(transpose_timbre[1]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[1]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[1]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[1]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[1]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[1]) row.append(seg_timbre_sum) row.append(transpose_timbre[2][i]) seg_timbre_avg= get_avg(transpose_timbre[2]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[2]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[2]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[2]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[2]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[2]) row.append(seg_timbre_sum) row.append(transpose_timbre[3][i]) seg_timbre_avg= get_avg(transpose_timbre[3]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[3]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[3]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[3]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[3]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[3]) row.append(seg_timbre_sum) row.append(transpose_timbre[4][i]) seg_timbre_avg= get_avg(transpose_timbre[4]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[4]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[4]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[4]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[4]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[4]) row.append(seg_timbre_sum) row.append(transpose_timbre[5][i]) seg_timbre_avg= get_avg(transpose_timbre[5]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[5]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[5]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[5]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[5]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[5]) row.append(seg_timbre_sum) row.append(transpose_timbre[6][i]) seg_timbre_avg= get_avg(transpose_timbre[6]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[6]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[6]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[6]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[6]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[6]) row.append(seg_timbre_sum) row.append(transpose_timbre[7][i]) seg_timbre_avg= get_avg(transpose_timbre[7]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[7]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[7]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[7]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[7]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[7]) row.append(seg_timbre_sum) row.append(transpose_timbre[8][i]) seg_timbre_avg= get_avg(transpose_timbre[8]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[8]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[8]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[8]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[8]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[8]) row.append(seg_timbre_sum) row.append(transpose_timbre[9][i]) seg_timbre_avg= get_avg(transpose_timbre[9]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[9]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[9]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[9]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[9]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[9]) row.append(seg_timbre_sum) row.append(transpose_timbre[10][i]) seg_timbre_avg= get_avg(transpose_timbre[10]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[10]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[10]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[10]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[10]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[10]) row.append(seg_timbre_sum) row.append(transpose_timbre[11][i]) seg_timbre_avg= get_avg(transpose_timbre[11]) row.append(seg_timbre_avg) seg_timbre_max= get_max(transpose_timbre[11]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[11]) row.append(seg_timbre_min) seg_timbre_stddev= get_stddev(transpose_timbre[11]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[11]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[11]) row.append(seg_timbre_sum) for item in row_seg2_padding: row.append(item) writer.writerow(row) row=[] row=gral_info[:] # "--------tatums---------------" tatms_c = hdf5_getters.get_tatums_confidence(h5) group_index=5 row_front=padding(245) #blank spaces left in front of tatums row=[] row=gral_info[:] for i,item in enumerate(tatms_c): row.append(group_index) row.append(i) for item in row_front: #appending blank spaces at the front of the row row.append(item) row.append(tatms_c[i]) tatms_c_avg= get_avg(tatms_c) row.append(tatms_c_avg) tatms_c_max= get_max(tatms_c) row.append(tatms_c_max) tatms_c_min = get_min(tatms_c) row.append(tatms_c_min) tatms_c_stddev= get_stddev(tatms_c) row.append(tatms_c_stddev) tatms_c_count = get_count(tatms_c) row.append(tatms_c_count) tatms_c_sum = get_sum(tatms_c) row.append(tatms_c_sum) tatms_start = hdf5_getters.get_tatums_start(h5) row.append(tatms_start[i]) tatms_start_avg= get_avg(tatms_start) row.append(tatms_start_avg) tatms_start_max= get_max(tatms_start) row.append(tatms_start_max) tatms_start_min = get_min(tatms_start) row.append(tatms_start_min) tatms_start_stddev= get_stddev(tatms_start) row.append(tatms_start_stddev) tatms_start_count = get_count(tatms_start) row.append(tatms_start_count) tatms_start_sum = get_sum(tatms_start) row.append(tatms_start_sum) writer.writerow(row) row=[] row=gral_info[:] transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 h5.close() count=count+1; print count;
track_duration = str(h.get_duration(h5, 0)) track_year = str(h.get_year(h5, 0)) cursor.execute("SELECT * FROM track WHERE track_id = '" + track_id + "'") rs = cursor.fetchall() if cursor.rowcount != 1: cursor.execute("INSERT INTO track VALUES ('" + track_id + "','" + track_title + "','" + artist_id + "','" + artist_name + "','" + track_album + "'," + track_duration + "," + track_year + ");") ''' Store track_analysis tuples ''' print("Track ID: " + h.get_track_id(h5, 0)) track_tempo = str(h.get_tempo(h5, 0)) track_key = str(h.get_key(h5, 0)) track_danceability = str(h.get_danceability(h5, 0)) if track_danceability == "nan": track_danceability = "0.0" track_hottness = str(h.get_song_hotttnesss(h5, 0)) if track_hottness == "nan": track_hottness = "0.0" cursor.execute("SELECT * FROM track_analysis WHERE track_id = '" + track_id + "'") rs = cursor.fetchall() if cursor.rowcount != 1: cursor.execute("INSERT INTO track_analysis VALUES ('" + track_id + "'," + track_tempo + "," + track_key + "," + track_danceability + "," + track_hottness + ");")
def get_all_files(basedir,ext='.h5') : """ From a root directory, go through all subdirectories and find all files with the given extension. Return all absolute paths in a list. """ c=0 title=[]#get_title(h5) name=[]#get_artist_name` familiarity=[]#get_artist_familiarity() artist_hotness=[]#get_artist_hotttnesss song_hotness=[]# get_song_hotttnesss danceability=[]#get_danceability energy=[]#get_energy loudness=[]#get_loudness tempo=[]#get_tempo mode_confidence=[]#get_mode_confidence() time_sig_confidence=[]#get_time_signature_confidence() no_segments=[]#len(get_segments_start()) avg_segment_confidence=[]#np.mean(hdf5_getters.get_segments_confidence(h5)) avg_segment_pitches=[]#np.mean(hdf5_getters.get_segments_pitches(h51)) no_sections=[]#len(hdf5_getters.get_sections_start()) avg_sections_confidence=[]#np.mean(hdf5_getters.get_sections_confidence(h51)) no_beats_start=[]#len(hdf5_getters.get_beats_start(h51)) avg_beats_confidence=[]#np.mean(hdf5_getters.get_beats_confidence(h51)) no_bars=[]#len(hdf5_getters.get_bars_start(h5)) avg_bar_confidence=[]#np.mean(hdf5_getters.get_bars_confidence((h51)) no_tatums_start=[]#len(hdf5_getters.get_tatums_start(h5)) avg_tatums_start=[]#np.mean(get_tatums_confidence()) billboard_presence=[]#returned value from web scraper key=[] duration=[] mode=[] target=pd.read_csv('Billboard.csv') j=0 if(not(os.path.isfile('./data.csv'))): for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files : h5 = hdf5_getters.open_h5_file_read(f) songnme=hdf5_getters.get_title(h5) artst=hdf5_getters.get_artist_name(h5) for i in range(len(target)): if(target.Title[i]==songnme and target.Name[i]==artst): billboard_presence.append(target.Presence[i]) title.append(songnme) name.append(artst) familiarity.append(hdf5_getters.get_artist_familiarity(h5)) artist_hotness.append(hdf5_getters.get_artist_hotttnesss(h5)) song_hotness.append(hdf5_getters. get_song_hotttnesss(h5)) danceability.append(hdf5_getters.get_danceability(h5)) key.append(hdf5_getters.get_key(h5)) duration.append(hdf5_getters.get_duration(h5)) mode.append(hdf5_getters.get_mode(h5)) energy.append(hdf5_getters.get_energy(h5)) loudness.append(hdf5_getters.get_loudness(h5)) tempo.append(hdf5_getters.get_tempo(h5)) mode_confidence.append(hdf5_getters.get_mode_confidence(h5)) time_sig_confidence.append(hdf5_getters.get_time_signature_confidence(h5)) no_segments.append(len(hdf5_getters.get_segments_start(h5))) avg_segment_confidence.append(np.mean(hdf5_getters.get_segments_confidence(h5))) avg_segment_pitches.append(np.mean(hdf5_getters.get_segments_pitches(h5))) no_sections.append(len(hdf5_getters.get_sections_start(h5))) avg_sections_confidence.append(np.mean(hdf5_getters.get_sections_confidence(h5))) no_beats_start.append(len(hdf5_getters.get_beats_start(h5))) avg_beats_confidence.append(np.mean(hdf5_getters.get_beats_confidence(h5))) no_bars.append(len(hdf5_getters.get_bars_start(h5))) avg_bar_confidence.append(np.mean(hdf5_getters.get_bars_confidence(h5))) no_tatums_start.append(len(hdf5_getters.get_tatums_start(h5))) avg_tatums_start.append(np.mean(hdf5_getters.get_tatums_confidence(h5))) j+=1 print j #prints the index number of each song, to keep track of the song being saved to the database, and to identify errors. break; h5.close() print "Created Arrays" df=pd.DataFrame(title,columns=['Title']) df['Artist_Name']=name df['Familiarity']=familiarity df['Hotness']=artist_hotness df['Song_hotness']=song_hotness df['Danceability']=danceability df['energy']=energy df['loudness']=loudness df['tempo']=tempo df['mode_confidence']=mode_confidence df['time_sig_confidence']=time_sig_confidence df['no_segments']=no_segments df['avg_segment_confidence']=avg_segment_confidence df['avg_segment_pitches']=avg_segment_pitches df['no_sections']=no_sections df['avg_sections_confidence']=avg_sections_confidence df['no_beats_start']=no_beats_start df['avg_beats_confidence']=avg_beats_confidence df['no_bars']=no_bars df['avg_bar_confidence']=avg_bar_confidence df['no_tatums_start']=no_tatums_start df['avg_tatums_start']=avg_tatums_start df['key']=key df['Mode']=mode df['duration']=duration df['Presence']=billboard_presence print df.head() print billboard_presence df.to_csv("data.csv") else: df=pd.read_csv('data.csv',index_col=0) print "Number of features in the created dataset:", print len(df.keys()) print return df
def data_to_flat_file(basedir, ext='.h5'): """ This function extracts the information from the tables and creates the flat file. """ count = 0 #song counter list_to_write = [] group_index = 0 row_to_write = "" writer = csv.writer(open("complete.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: row = [] print f h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title = title.replace('"', '') row.append(title) comma = title.find(',') if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album = album.replace('"', '') row.append(album) comma = album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma = artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name = artist_name.replace('"', '') row.append(artist_name) duration = hdf5_getters.get_duration(h5) row.append(duration) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) row.append(samp_rt) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) row.append(artist_7digitalid) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam = -1 row.append(artist_fam) artist_hotness = hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness = -1 row.append(artist_hotness) artist_id = hdf5_getters.get_artist_id(h5) row.append(artist_id) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat = -1 row.append(artist_lat) artist_loc = hdf5_getters.get_artist_location(h5) row.append(artist_loc) artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon = -1 row.append(artist_lon) artist_mbid = hdf5_getters.get_artist_mbid(h5) row.append(artist_mbid) #Getting the genre art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes = get_genre_indexes( trm_freq) #index of the highest freq genre_set = 0 #flag to see if the genre has been set or not final_genre = [] genres_so_far = [] for i in range(len(genre_indexes)): genre_tmp = get_genre( art_trm, genre_indexes[i] ) #genre that corresponds to the highest freq genres_so_far = genre_dict.get_genre_in_dict( genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set = 1 if genre_set == 1: col_num = [] for i in final_genre: column = int(i) #getting the column number of the genre col_num.append(column) genre_array = genre_columns(col_num) #genre array for i in range(len( genre_array)): #appending the genre_array to the row row.append(genre_array[i]) else: genre_array = genre_columns( -1 ) #when there is no genre matched, return an array of [0...0] for i in range(len( genre_array)): #appending the genre_array to the row row.append(genre_array[i]) artist_pmid = hdf5_getters.get_artist_playmeid(h5) row.append(artist_pmid) audio_md5 = hdf5_getters.get_audio_md5(h5) row.append(audio_md5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability = -1 row.append(danceability) end_fade_in = hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in = -1 row.append(end_fade_in) energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy = -1 row.append(energy) song_key = hdf5_getters.get_key(h5) row.append(song_key) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c = -1 row.append(key_c) loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness = -1 row.append(loudness) mode = hdf5_getters.get_mode(h5) row.append(mode) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf = -1 row.append(mode_conf) release_7digitalid = hdf5_getters.get_release_7digitalid(h5) row.append(release_7digitalid) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot = -1 row.append(song_hot) song_id = hdf5_getters.get_song_id(h5) row.append(song_id) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) row.append(start_fade_out) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo = -1 row.append(tempo) time_sig = hdf5_getters.get_time_signature(h5) row.append(time_sig) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c = -1 row.append(time_sig_c) track_id = hdf5_getters.get_track_id(h5) row.append(track_id) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) row.append(track_7digitalid) year = hdf5_getters.get_year(h5) row.append(year) bars_c = hdf5_getters.get_bars_confidence(h5) bars_start = hdf5_getters.get_bars_start(h5) row_bars_padding = padding( 245 ) #this is the array that will be attached at the end of th row #--------------bars---------------" gral_info = [] gral_info = row[:] empty = [] for i, item in enumerate(bars_c): row.append(group_index) row.append(i) row.append(bars_c[i]) bars_c_avg = get_avg(bars_c) row.append(bars_c_avg) bars_c_max = get_max(bars_c) row.append(bars_c_max) bars_c_min = get_min(bars_c) row.append(bars_c_min) bars_c_stddev = get_stddev(bars_c) row.append(bars_c_stddev) bars_c_count = get_count(bars_c) row.append(bars_c_count) bars_c_sum = get_sum(bars_c) row.append(bars_c_sum) row.append(bars_start[i]) bars_start_avg = get_avg(bars_start) row.append(bars_start_avg) bars_start_max = get_max(bars_start) row.append(bars_start_max) bars_start_min = get_min(bars_start) row.append(bars_start_min) bars_start_stddev = get_stddev(bars_start) row.append(bars_start_stddev) bars_start_count = get_count(bars_start) row.append(bars_start_count) bars_start_sum = get_sum(bars_start) row.append(bars_start_sum) for i in row_bars_padding: row.append(i) writer.writerow(row) row = [] row = gral_info[:] #--------beats---------------" beats_c = hdf5_getters.get_beats_confidence(h5) group_index = 1 row = [] row = gral_info[:] row_front = padding( 14) #blanks left in front of the row(empty spaces for bars) row_beats_padding = padding(231) for i, item in enumerate(beats_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the beats row.append(index) row.append(beats_c[i]) beats_c_avg = get_avg(beats_c) row.append(beats_c_avg) beats_c_max = get_max(beats_c) row.append(beats_c_max) beats_c_min = get_min(beats_c) row.append(beats_c_min) beats_c_stddev = get_stddev(beats_c) row.append(beats_c_stddev) beats_c_count = get_count(beats_c) row.append(beats_c_count) beats_c_sum = get_sum(beats_c) row.append(beats_c_sum) beats_start = hdf5_getters.get_beats_start(h5) row.append(beats_start[i]) beats_start_avg = get_avg(beats_start) row.append(beats_start_avg) beats_start_max = get_max(beats_start) row.append(beats_start_max) beats_start_min = get_min(beats_start) row.append(beats_start_min) beats_start_stddev = get_stddev(beats_start) row.append(beats_start_stddev) beats_start_count = get_count(beats_start) row.append(beats_start_count) beats_start_sum = get_sum(beats_start) row.append(beats_start_sum) for i in row_beats_padding: row.append(i) writer.writerow(row) row = [] row = gral_info[:] # "--------sections---------------" row_sec_padding = padding( 217) #blank spaces left at the end of the row sec_c = hdf5_getters.get_sections_confidence(h5) group_index = 2 row = [] row = gral_info[:] row_front = padding( 28) #blank spaces left in front(empty spaces for bars,beats) for i, item in enumerate(sec_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the sections row.append(index) row.append(sec_c[i]) sec_c_avg = get_avg(sec_c) row.append(sec_c_avg) sec_c_max = get_max(sec_c) row.append(sec_c_max) sec_c_min = get_min(sec_c) row.append(sec_c_min) sec_c_stddev = get_stddev(sec_c) row.append(sec_c_stddev) sec_c_count = get_count(sec_c) row.append(sec_c_count) sec_c_sum = get_sum(sec_c) row.append(sec_c_sum) sec_start = hdf5_getters.get_sections_start(h5) row.append(sec_start[i]) sec_start_avg = get_avg(sec_start) row.append(sec_start_avg) sec_start_max = get_max(sec_start) row.append(sec_start_max) sec_start_min = get_min(sec_start) row.append(sec_start_min) sec_start_stddev = get_stddev(sec_start) row.append(sec_start_stddev) sec_start_count = get_count(sec_start) row.append(sec_start_count) sec_start_sum = get_sum(sec_start) row.append(sec_start_sum) for i in row_sec_padding: #appending the blank spaces at the end of the row row.append(i) writer.writerow(row) row = [] row = gral_info[:] #--------segments---------------" row_seg_padding = padding(182) #blank spaces at the end of the row row_front = padding(42) #blank spaces left in front of segments seg_c = hdf5_getters.get_segments_confidence(h5) group_index = 3 row = [] row = gral_info[:] for i, item in enumerate(seg_c): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of the segments row.append(index) row.append(seg_c[i]) seg_c_avg = get_avg(seg_c) row.append(seg_c_avg) seg_c_max = get_max(seg_c) row.append(seg_c_max) seg_c_min = get_min(seg_c) row.append(seg_c_min) seg_c_stddev = get_stddev(seg_c) row.append(seg_c_stddev) seg_c_count = get_count(seg_c) row.append(seg_c_count) seg_c_sum = get_sum(seg_c) row.append(seg_c_sum) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) row.append(seg_loud_max[i]) seg_loud_max_avg = get_avg(seg_loud_max) row.append(seg_loud_max_avg) seg_loud_max_max = get_max(seg_loud_max) row.append(seg_loud_max_max) seg_loud_max_min = get_min(seg_loud_max) row.append(seg_loud_max_min) seg_loud_max_stddev = get_stddev(seg_loud_max) row.append(seg_loud_max_stddev) seg_loud_max_count = get_count(seg_loud_max) row.append(seg_loud_max_count) seg_loud_max_sum = get_sum(seg_loud_max) row.append(seg_loud_max_sum) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time( h5) row.append(seg_loud_max_time[i]) seg_loud_max_time_avg = get_avg(seg_loud_max_time) row.append(seg_loud_max_time_avg) seg_loud_max_time_max = get_max(seg_loud_max_time) row.append(seg_loud_max_time_max) seg_loud_max_time_min = get_min(seg_loud_max_time) row.append(seg_loud_max_time_min) seg_loud_max_time_stddev = get_stddev(seg_loud_max_time) row.append(seg_loud_max_time_stddev) seg_loud_max_time_count = get_count(seg_loud_max_time) row.append(seg_loud_max_time_count) seg_loud_max_time_sum = get_sum(seg_loud_max_time) row.append(seg_loud_max_time_sum) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) row.append(seg_loud_start[i]) seg_loud_start_avg = get_avg(seg_loud_start) row.append(seg_loud_start_avg) seg_loud_start_max = get_max(seg_loud_start) row.append(seg_loud_start_max) seg_loud_start_min = get_min(seg_loud_start) row.append(seg_loud_start_min) seg_loud_start_stddev = get_stddev(seg_loud_start) row.append(seg_loud_start_stddev) seg_loud_start_count = get_count(seg_loud_start) row.append(seg_loud_start_count) seg_loud_start_sum = get_sum(seg_loud_start) row.append(seg_loud_start_sum) seg_start = hdf5_getters.get_segments_start(h5) row.append(seg_start[i]) seg_start_avg = get_avg(seg_start) row.append(seg_start_avg) seg_start_max = get_max(seg_start) row.append(seg_start_max) seg_start_min = get_min(seg_start) row.append(seg_start_min) seg_start_stddev = get_stddev(seg_start) row.append(seg_start_stddev) seg_start_count = get_count(seg_start) row.append(seg_start_count) seg_start_sum = get_sum(seg_start) row.append(seg_start_sum) for i in row_seg_padding: #appending blank spaces at the end of the row row.append(i) writer.writerow(row) row = [] row = gral_info[:] #----------segments pitch and timbre---------------" row_seg2_padding = padding( 14) #blank spaces left at the end of the row row_front = padding( 77) #blank spaces left at the front of the segments and timbre seg_pitch = hdf5_getters.get_segments_pitches(h5) transpose_pitch = seg_pitch.transpose( ) #this is to tranpose the matrix,so we can have 12 rows group_index = 4 row = [] row = gral_info[:] for i, item in enumerate(transpose_pitch[0]): row.append(group_index) row.append(i) for index in row_front: #padding blanks in front of segments and timbre row.append(index) row.append(transpose_pitch[0][i]) seg_pitch_avg = get_avg(transpose_pitch[0]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[0]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[0]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[0]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[0]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[0]) row.append(seg_pitch_sum) row.append(transpose_pitch[1][i]) seg_pitch_avg = get_avg(transpose_pitch[1]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[1]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[1]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[1]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[1]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[1]) row.append(seg_pitch_sum) row.append(transpose_pitch[2][i]) seg_pitch_avg = get_avg(transpose_pitch[2]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[2]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[2]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[2]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[2]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[2]) row.append(seg_pitch_sum) row.append(transpose_pitch[3][i]) seg_pitch_avg = get_avg(transpose_pitch[3]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[3]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[3]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[3]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[3]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[3]) row.append(seg_pitch_sum) row.append(transpose_pitch[4][i]) seg_pitch_avg = get_avg(transpose_pitch[4]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[4]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[4]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[4]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[4]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[4]) row.append(seg_pitch_sum) row.append(transpose_pitch[5][i]) seg_pitch_avg = get_avg(transpose_pitch[5]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[5]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[5]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[5]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[5]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[5]) row.append(seg_pitch_sum) row.append(transpose_pitch[6][i]) seg_pitch_avg = get_avg(transpose_pitch[6]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[6]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[6]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[6]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[6]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[6]) row.append(seg_pitch_sum) row.append(transpose_pitch[7][i]) seg_pitch_avg = get_avg(transpose_pitch[7]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[7]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[7]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[7]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[7]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[7]) row.append(seg_pitch_sum) row.append(transpose_pitch[8][i]) seg_pitch_avg = get_avg(transpose_pitch[8]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[8]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[8]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[8]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[8]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[8]) row.append(seg_pitch_sum) row.append(transpose_pitch[9][i]) seg_pitch_avg = get_avg(transpose_pitch[9]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[9]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[9]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[9]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[9]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[9]) row.append(seg_pitch_sum) row.append(transpose_pitch[10][i]) seg_pitch_avg = get_avg(transpose_pitch[10]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[10]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[10]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[10]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[10]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[10]) row.append(seg_pitch_sum) row.append(transpose_pitch[11][i]) seg_pitch_avg = get_avg(transpose_pitch[11]) row.append(seg_pitch_avg) seg_pitch_max = get_max(transpose_pitch[11]) row.append(seg_pitch_max) seg_pitch_min = get_min(transpose_pitch[11]) row.append(seg_pitch_min) seg_pitch_stddev = get_stddev(transpose_pitch[11]) row.append(seg_pitch_stddev) seg_pitch_count = get_count(transpose_pitch[11]) row.append(seg_pitch_count) seg_pitch_sum = get_sum(transpose_pitch[11]) row.append(seg_pitch_sum) #timbre arrays seg_timbre = hdf5_getters.get_segments_timbre(h5) transpose_timbre = seg_pitch.transpose( ) #tranposing matrix, to have 12 rows row.append(transpose_timbre[0][i]) seg_timbre_avg = get_avg(transpose_timbre[0]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[0]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[0]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[0]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[0]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[0]) row.append(seg_timbre_sum) row.append(transpose_timbre[1][i]) seg_timbre_avg = get_avg(transpose_timbre[1]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[1]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[1]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[1]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[1]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[1]) row.append(seg_timbre_sum) row.append(transpose_timbre[2][i]) seg_timbre_avg = get_avg(transpose_timbre[2]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[2]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[2]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[2]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[2]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[2]) row.append(seg_timbre_sum) row.append(transpose_timbre[3][i]) seg_timbre_avg = get_avg(transpose_timbre[3]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[3]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[3]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[3]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[3]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[3]) row.append(seg_timbre_sum) row.append(transpose_timbre[4][i]) seg_timbre_avg = get_avg(transpose_timbre[4]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[4]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[4]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[4]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[4]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[4]) row.append(seg_timbre_sum) row.append(transpose_timbre[5][i]) seg_timbre_avg = get_avg(transpose_timbre[5]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[5]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[5]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[5]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[5]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[5]) row.append(seg_timbre_sum) row.append(transpose_timbre[6][i]) seg_timbre_avg = get_avg(transpose_timbre[6]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[6]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[6]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[6]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[6]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[6]) row.append(seg_timbre_sum) row.append(transpose_timbre[7][i]) seg_timbre_avg = get_avg(transpose_timbre[7]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[7]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[7]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[7]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[7]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[7]) row.append(seg_timbre_sum) row.append(transpose_timbre[8][i]) seg_timbre_avg = get_avg(transpose_timbre[8]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[8]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[8]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[8]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[8]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[8]) row.append(seg_timbre_sum) row.append(transpose_timbre[9][i]) seg_timbre_avg = get_avg(transpose_timbre[9]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[9]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[9]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[9]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[9]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[9]) row.append(seg_timbre_sum) row.append(transpose_timbre[10][i]) seg_timbre_avg = get_avg(transpose_timbre[10]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[10]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[10]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[10]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[10]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[10]) row.append(seg_timbre_sum) row.append(transpose_timbre[11][i]) seg_timbre_avg = get_avg(transpose_timbre[11]) row.append(seg_timbre_avg) seg_timbre_max = get_max(transpose_timbre[11]) row.append(seg_timbre_max) seg_timbre_min = get_min(transpose_timbre[11]) row.append(seg_timbre_min) seg_timbre_stddev = get_stddev(transpose_timbre[11]) row.append(seg_timbre_stddev) seg_timbre_count = get_count(transpose_timbre[11]) row.append(seg_timbre_count) seg_timbre_sum = get_sum(transpose_timbre[11]) row.append(seg_timbre_sum) for item in row_seg2_padding: row.append(item) writer.writerow(row) row = [] row = gral_info[:] # "--------tatums---------------" tatms_c = hdf5_getters.get_tatums_confidence(h5) group_index = 5 row_front = padding(245) #blank spaces left in front of tatums row = [] row = gral_info[:] for i, item in enumerate(tatms_c): row.append(group_index) row.append(i) for item in row_front: #appending blank spaces at the front of the row row.append(item) row.append(tatms_c[i]) tatms_c_avg = get_avg(tatms_c) row.append(tatms_c_avg) tatms_c_max = get_max(tatms_c) row.append(tatms_c_max) tatms_c_min = get_min(tatms_c) row.append(tatms_c_min) tatms_c_stddev = get_stddev(tatms_c) row.append(tatms_c_stddev) tatms_c_count = get_count(tatms_c) row.append(tatms_c_count) tatms_c_sum = get_sum(tatms_c) row.append(tatms_c_sum) tatms_start = hdf5_getters.get_tatums_start(h5) row.append(tatms_start[i]) tatms_start_avg = get_avg(tatms_start) row.append(tatms_start_avg) tatms_start_max = get_max(tatms_start) row.append(tatms_start_max) tatms_start_min = get_min(tatms_start) row.append(tatms_start_min) tatms_start_stddev = get_stddev(tatms_start) row.append(tatms_start_stddev) tatms_start_count = get_count(tatms_start) row.append(tatms_start_count) tatms_start_sum = get_sum(tatms_start) row.append(tatms_start_sum) writer.writerow(row) row = [] row = gral_info[:] transpose_pitch = seg_pitch.transpose( ) #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg = [] seg_pitch_max = [] seg_pitch_min = [] seg_pitch_stddev = [] seg_pitch_count = [] seg_pitch_sum = [] i = 0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i = i + 1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose( ) #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg = [] seg_timbre_max = [] seg_timbre_min = [] seg_timbre_stddev = [] seg_timbre_count = [] seg_timbre_sum = [] i = 0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i = i + 1 h5.close() count = count + 1 print count
def data_to_flat_file(basedir,ext='.h5') : """This function extract the information from the tables and creates the flat file.""" count = 0; #song counter list_to_write= [] row_to_write = "" writer = csv.writer(open("metadata.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') comma=title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') #eliminating commas in the album comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,"); if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg= get_avg(bars_c) bars_c_max= get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev= get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max= get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev= get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg= get_avg(beats_c) beats_c_max= get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev= get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max= get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev= get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg= get_avg(sec_c) sec_c_max= get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev= get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max= get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev= get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg= get_avg(seg_c) seg_c_max= get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev= get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg= get_avg(seg_loud_max) seg_loud_max_max= get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev= get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg= get_avg(seg_loud_max_time) seg_loud_max_time_max= get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg= get_avg(seg_loud_start) seg_loud_start_max= get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev= get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg= get_avg(seg_start) seg_start_max= get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev= get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg= get_avg(tatms_c) tatms_c_max= get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev= get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg= get_avg(tatms_start) tatms_start_max= get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev= get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 #genre was found in dictionary if genre_set == 1: col_num=[] for genre in final_genre: column=int(genre) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array else: genre_array=genre_columns(-1) #the genre was not found in the dictionary transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 #Writing to the flat file writer.writerow([title,album,artist_name,duration,samp_rt,artist_7digitalid,artist_fam,artist_hotness,artist_id,artist_lat,artist_loc,artist_lon,artist_mbid,genre_array[0],genre_array[1],genre_array[2], genre_array[3],genre_array[4],genre_array[5],genre_array[6],genre_array[7],genre_array[8],genre_array[9],genre_array[10],genre_array[11],genre_array[12],genre_array[13],genre_array[14],genre_array[15], genre_array[16],genre_array[17],genre_array[18],genre_array[19],genre_array[20],genre_array[21],genre_array[22],genre_array[23],genre_array[24],genre_array[25],genre_array[26], genre_array[27],genre_array[28],genre_array[29],genre_array[30],genre_array[31],genre_array[32],genre_array[33],genre_array[34],genre_array[35],genre_array[36],genre_array[37],genre_array[38], genre_array[39],genre_array[40],genre_array[41],genre_array[42],genre_array[43],genre_array[44],genre_array[45],genre_array[46],genre_array[47],genre_array[48],genre_array[49], genre_array[50],genre_array[51],genre_array[52],genre_array[53],genre_array[54],genre_array[55],genre_array[56],genre_array[57],genre_array[58],genre_array[59], genre_array[60],genre_array[61],genre_array[62],genre_array[63],genre_array[64],genre_array[65],genre_array[66],genre_array[67],genre_array[68],genre_array[69], genre_array[70],genre_array[71],genre_array[72],genre_array[73],genre_array[74],genre_array[75],genre_array[76],genre_array[77],genre_array[78],genre_array[79], genre_array[80],genre_array[81],genre_array[82],genre_array[83],genre_array[84],genre_array[85],genre_array[86],genre_array[87],genre_array[88],genre_array[89], genre_array[90],genre_array[91],genre_array[92],genre_array[93],genre_array[94],genre_array[95],genre_array[96],genre_array[97],genre_array[98],genre_array[99],genre_array[100],genre_array[101], genre_array[102],genre_array[103],genre_array[104],genre_array[105],genre_array[106],genre_array[107],genre_array[108],genre_array[109],genre_array[110],genre_array[111],genre_array[112], genre_array[113],genre_array[114],genre_array[115],genre_array[116],genre_array[117],genre_array[118],genre_array[119],genre_array[120],genre_array[121],genre_array[122],genre_array[123], genre_array[124],genre_array[125],genre_array[126],genre_array[127],genre_array[128],genre_array[129],genre_array[130],genre_array[131],genre_array[132], artist_pmid,audio_md5,danceability,end_fade_in,energy,song_key,key_c,loudness,mode,mode_conf,release_7digitalid,song_hot,song_id,start_fade_out,tempo,time_sig,time_sig_c,track_id,track_7digitalid,year,bars_c_avg,bars_c_max,bars_c_min,bars_c_stddev,bars_c_count,bars_c_sum,bars_start_avg,bars_start_max,bars_start_min,bars_start_stddev,bars_start_count,bars_start_sum,beats_c_avg,beats_c_max,beats_c_min,beats_c_stddev,beats_c_count,beats_c_sum,beats_start_avg,beats_start_max,beats_start_min, beats_start_stddev,beats_start_count,beats_start_sum, sec_c_avg,sec_c_max,sec_c_min,sec_c_stddev,sec_c_count,sec_c_sum,sec_start_avg,sec_start_max,sec_start_min,sec_start_stddev,sec_start_count,sec_start_sum,seg_c_avg,seg_c_max,seg_c_min,seg_c_stddev,seg_c_count,seg_c_sum,seg_loud_max_avg,seg_loud_max_max,seg_loud_max_min,seg_loud_max_stddev,seg_loud_max_count,seg_loud_max_sum,seg_loud_max_time_avg,seg_loud_max_time_max,seg_loud_max_time_min,seg_loud_max_time_stddev,seg_loud_max_time_count,seg_loud_max_time_sum,seg_loud_start_avg,seg_loud_start_max,seg_loud_start_min,seg_loud_start_stddev,seg_loud_start_count,seg_loud_start_sum,seg_pitch_avg[0],seg_pitch_max[0],seg_pitch_min[0],seg_pitch_stddev[0],seg_pitch_count[0],seg_pitch_sum[0],seg_pitch_avg[1],seg_pitch_max[1],seg_pitch_min[1],seg_pitch_stddev[1],seg_pitch_count[1],seg_pitch_sum[1],seg_pitch_avg[2],seg_pitch_max[2],seg_pitch_min[2],seg_pitch_stddev[2],seg_pitch_count[2],seg_pitch_sum[2],seg_pitch_avg[3],seg_pitch_max[3],seg_pitch_min[3],seg_pitch_stddev[3],seg_pitch_count[3],seg_pitch_sum[3],seg_pitch_avg[4],seg_pitch_max[4],seg_pitch_min[4],seg_pitch_stddev[4],seg_pitch_count[4],seg_pitch_sum[4],seg_pitch_avg[5],seg_pitch_max[5],seg_pitch_min[5],seg_pitch_stddev[5],seg_pitch_count[5],seg_pitch_sum[5],seg_pitch_avg[6],seg_pitch_max[6],seg_pitch_min[6],seg_pitch_stddev[6],seg_pitch_count[6],seg_pitch_sum[6],seg_pitch_avg[7],seg_pitch_max[7],seg_pitch_min[7],seg_pitch_stddev[7],seg_pitch_count[7],seg_pitch_sum[7],seg_pitch_avg[8],seg_pitch_max[8],seg_pitch_min[8],seg_pitch_stddev[8],seg_pitch_count[8],seg_pitch_sum[8],seg_pitch_avg[9],seg_pitch_max[9],seg_pitch_min[9],seg_pitch_stddev[9],seg_pitch_count[9],seg_pitch_sum[9],seg_pitch_avg[10],seg_pitch_max[10],seg_pitch_min[10],seg_pitch_stddev[10],seg_pitch_count[10],seg_pitch_sum[10],seg_pitch_avg[11],seg_pitch_max[11],seg_pitch_min[11], seg_pitch_stddev[11],seg_pitch_count[11],seg_pitch_sum[11],seg_start_avg,seg_start_max,seg_start_min,seg_start_stddev, seg_start_count,seg_start_sum,seg_timbre_avg[0],seg_timbre_max[0],seg_timbre_min[0],seg_timbre_stddev[0],seg_timbre_count[0], seg_timbre_sum[0],seg_timbre_avg[1],seg_timbre_max[1],seg_timbre_min[1],seg_timbre_stddev[1],seg_timbre_count[1], seg_timbre_sum[1],seg_timbre_avg[2],seg_timbre_max[2],seg_timbre_min[2],seg_timbre_stddev[2],seg_timbre_count[2], seg_timbre_sum[2],seg_timbre_avg[3],seg_timbre_max[3],seg_timbre_min[3],seg_timbre_stddev[3],seg_timbre_count[3], seg_timbre_sum[3],seg_timbre_avg[4],seg_timbre_max[4],seg_timbre_min[4],seg_timbre_stddev[4],seg_timbre_count[4], seg_timbre_sum[4],seg_timbre_avg[5],seg_timbre_max[5],seg_timbre_min[5],seg_timbre_stddev[5],seg_timbre_count[5], seg_timbre_sum[5],seg_timbre_avg[6],seg_timbre_max[6],seg_timbre_min[6],seg_timbre_stddev[6],seg_timbre_count[6], seg_timbre_sum[6],seg_timbre_avg[7],seg_timbre_max[7],seg_timbre_min[7],seg_timbre_stddev[7],seg_timbre_count[7], seg_timbre_sum[7],seg_timbre_avg[8],seg_timbre_max[8],seg_timbre_min[8],seg_timbre_stddev[8],seg_timbre_count[8], seg_timbre_sum[8],seg_timbre_avg[9],seg_timbre_max[9],seg_timbre_min[9],seg_timbre_stddev[9],seg_timbre_count[9], seg_timbre_sum[9],seg_timbre_avg[10],seg_timbre_max[10],seg_timbre_min[10],seg_timbre_stddev[10],seg_timbre_count[10], seg_timbre_sum[10],seg_timbre_avg[11],seg_timbre_max[11],seg_timbre_min[11],seg_timbre_stddev[11],seg_timbre_count[11], seg_timbre_sum[11],tatms_c_avg,tatms_c_max,tatms_c_min,tatms_c_stddev,tatms_c_count,tatms_c_sum,tatms_start_avg,tatms_start_max,tatms_start_min,tatms_start_stddev,tatms_start_count,tatms_start_sum]) h5.close() count=count+1; print count;
def main(): outputFile1 = open('SongCSV.csv', 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input("\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"+ " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString); csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = ("SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,"+ "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,"+ "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"+ "Title,Year") ################################################# csvAttributeList = re.split('\W+', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() outputFile1.write("SongNumber,"); outputFile1.write(csvRowString + "\n"); csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "." # "." As the default means the current directory ext = ".H5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumName = str(hdf5_getters.get_release(songH5File)) song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str(hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignatureConfidence = str(hdf5_getters.get_key_confidence(songH5File)) # song.lyrics = None # song.popularity = None song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str(hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str(hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) #print song count csvRowString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace(',',"") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',','') csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): csvRowString += "\"" + song.artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" songH5File.close() outputFile1.close()
for f in fs: counter += 1 if (counter % 1000) == 0: print "Progress: {0}".format(counter) if (counter % 10000) == 0: outputFile1.close() file_counter += 1 outputFile1 = open( os.path.join(output_dir, base_file_name).format(file_counter), 'w') writeheader(outputFile1, csvHeaderString, ';') songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.artistName = remove_trap_characters( str(hdf5_getters.get_artist_name(songH5File))) song.artistID = remove_trap_characters( str(hdf5_getters.get_artist_id(songH5File))) song.albumID = remove_trap_characters( str(hdf5_getters.get_release_7digitalid(songH5File))) song.artistLatitude = remove_trap_characters( str(hdf5_getters.get_artist_latitude(songH5File))) # Replace the comma in the location (if there is one), since this will displace the entire row song.artistLocation = remove_trap_characters( str(hdf5_getters.get_artist_location(songH5File))).replace( ',', ':')
def get_all_rows(basedir, ext='.h5'): rows = [] for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: # print(os.path.join(root, f)) h5 = hdf5_getters.open_h5_file_read(f) num_songs = hdf5_getters.get_num_songs(h5) # print(num_songs) for i in range(num_songs): print(i) obj = {} obj['artist_name'] = hdf5_getters.get_artist_name( h5, i).decode('UTF-8') obj['artist_familiarity'] = hdf5_getters.get_artist_familiarity( h5, i) obj['artist_hotness'] = hdf5_getters.get_artist_hotttnesss( h5, i) obj['artist_id'] = hdf5_getters.get_artist_id( h5, i).decode('UTF-8') # obj['artist_mbid']=hdf5_getters.get_artist_mbid(h5,i).decode('UTF-8') obj['artist_playmeid'] = hdf5_getters.get_artist_playmeid( h5, i) obj['artist_7digitalid'] = hdf5_getters.get_artist_7digitalid( h5, i) # obj['artist_latitude']=hdf5_getters.get_artist_latitude(h5,i) # obj['artist_longitude']=hdf5_getters.get_artist_longitude(h5,i) # obj['artist_location']=hdf5_getters.get_artist_location(h5,i).decode('UTF-8') obj['artist_name'] = hdf5_getters.get_artist_name( h5, i).decode('UTF-8') obj['release'] = hdf5_getters.get_release(h5, i).decode('UTF-8') obj['song_hotttnesss'] = hdf5_getters.get_song_hotttnesss( h5, i) obj['title'] = hdf5_getters.get_title(h5, i).decode('UTF-8') # obj['artist_terms']=hdf5_getters.get_artist_terms(h5) # obj['artist_terms_freq']=hdf5_getters.get_artist_terms_freq(h5) # obj['artist_terms_weight']=hdf5_getters.get_artist_terms_weight(h5) # obj['audio_md5']=hdf5_getters.get_audio_md5(h5).decode('UTF-8') obj['danceability'] = hdf5_getters.get_danceability(h5, i) obj['duration'] = hdf5_getters.get_duration(h5, i) obj['end_of_fade_in'] = hdf5_getters.get_end_of_fade_in(h5, i) obj['energy'] = hdf5_getters.get_energy(h5, i) obj['key'] = hdf5_getters.get_key(h5, i) obj['key_confidence'] = hdf5_getters.get_key_confidence(h5, i) obj['loudness'] = hdf5_getters.get_loudness(h5, i) obj['mode'] = hdf5_getters.get_mode(h5, i) # obj['start_of_fade_out']=hdf5_getters.get_start_of_fade_out(h5) obj['tempo'] = hdf5_getters.get_tempo(h5, i) obj['time_signature'] = hdf5_getters.get_time_signature(h5, i) # obj['time_signature_confidence']=hdf5_getters.get_time_signature_confidence(h5) obj['track_id'] = hdf5_getters.get_track_id(h5, i).decode('UTF-8') # obj['segments_start']=hdf5_getters.get_segments_start(h5) # obj['segments_confidence']=hdf5_getters.get_segments_confidence(h5) # obj['segments_pitches']=hdf5_getters.get_segments_pitches(h5) # obj['segments_timbre']=hdf5_getters.get_segments_timbre(h5) # obj['segments_loudness_max']=hdf5_getters.get_segments_loudness_max(h5) # obj['segments_loudness_max_time']=hdf5_getters.get_segments_loudness_max_time(h5) # obj['segments_confidence']=hdf5_getters.get_segments_confidence(h5) # obj['segments_loudness_start']=hdf5_getters.get_segments_loudness_start(h5) # obj['sections_start']=hdf5_getters.get_sections_start(h5) # obj['sections_confidence']=hdf5_getters.get_sections_confidence(h5) # obj['beats_start']=hdf5_getters.get_beats_start(h5) # obj['beats_confidence']=hdf5_getters.get_beats_confidence(h5) # obj['bars_start']=hdf5_getters.get_bars_start(h5) # obj['bars_confidence']=hdf5_getters.get_bars_confidence(h5) # obj['tatums_start']=hdf5_getters.get_tatums_start(h5) # obj['artist_mbtags']=hdf5_getters.get_artist_mbtags(h5) # obj['artist_mbtags_count']=hdf5_getters.get_artist_mbtags_count(h5) obj['year'] = hdf5_getters.get_year(h5, i) rows.append(obj) h5.close() return rows
def main(): basedir = "D:/Master K" ext = ".H5" # Set the extension here. H5 is the extension for HDF5 files. songs = [] for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) #songs = {} #keys = list() #values = list() for f in files: print(f) songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) item = {"song_id": song.id.replace('b', '')} song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.artistID = song.artistID.replace('b', '', 1) item["song_artistID"] = song.artistID song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) song.albumID = song.albumID.replace('b', '', 1) item["song_albumID"] = song.albumID song.albumName = str(hdf5_getters.get_release(songH5File)) song.albumName = song.albumName.replace('b', '', 1) item["song_albumName"] = song.albumName song.artistLatitude = str( hdf5_getters.get_artist_latitude(songH5File)) song.artistLatitude = song.artistLatitude.replace('b', '', 1) item["song_artistLatitude"] = song.artistLatitude song.artistLocation = str( hdf5_getters.get_artist_location(songH5File)) song.artistLocation = song.artistLocation.replace('b', '', 1) item["song_artistLocation"] = song.artistLocation song.artistLongitude = str( hdf5_getters.get_artist_longitude(songH5File)) song.artistLongitude = song.artistLongitude.replace('b', '', 1) item["song_artistLongitude"] = song.artistLongitude song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.artistName = song.artistName.replace('b', '', 1) item["song_artistName"] = song.artistName song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.danceability = song.danceability.replace('b', '', 1) item["song_danceability"] = song.danceability song.duration = str(hdf5_getters.get_duration(songH5File)) song.duration = song.duration.replace('b', '', 1) item["song_duration"] = song.duration song.keySignature = str(hdf5_getters.get_key(songH5File)) song.keySignature = song.keySignature.replace('b', '', 1) item["song_keySignature"] = song.keySignature song.keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) song.keySignatureConfidence = song.keySignatureConfidence.replace( 'b', '', 1) item["song_keySignatureConfidence"] = song.keySignatureConfidence song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.tempo = song.tempo.replace('b', '', 1) item["song_tempo"] = song.tempo song.timeSignature = str( hdf5_getters.get_time_signature(songH5File)) song.timeSignature = song.timeSignature.replace('b', '', 1) item["song_timeSignature"] = song.timeSignature song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.timeSignatureConfidence = song.timeSignatureConfidence.replace( 'b', '', 1) item["song_timeSignatureConfidence"] = song.timeSignatureConfidence song.title = str(hdf5_getters.get_title(songH5File)) song.title = song.title.replace('b', '', 1) item["song_title"] = song.title song.year = str(hdf5_getters.get_year(songH5File)) song.year = song.year.replace('b', '', 1) item["song_year"] = song.year #song.mfcc = str(hdf5_getters.get_segments_timbre(songH5File)) #item["song_mfcc"] = song.mfcc item["song_mfcc"] = list( hdf5_getters.get_segments_timbre(songH5File)) song.hotness = str(hdf5_getters.get_artist_hotttnesss(songH5File)) item["song_hotness"] = song.hotness songs.append(item) songH5File.close() #song_dict= dict(zip(keys, values)) with open("D:\data_file_k.json", "w") as write_file: json.dump(songs, write_file, cls=NumpyEncoder)
def get_danceability(self): if self.h5 == None: self.open() return hdf5_getters.get_danceability(self.h5)
track_title = track_title.replace("'","") track_album = h.get_release(h5,0) track_album = track_album.replace("'","") track_duration = str(h.get_duration(h5,0)) track_year = str(h.get_year(h5,0)) cursor.execute("SELECT * FROM track WHERE track_id = '" + track_id + "'") rs = cursor.fetchall() if cursor.rowcount != 1: cursor.execute("INSERT INTO track VALUES ('" + track_id + "','" + track_title + "','" + artist_id + "','" + artist_name + "','" + track_album + "'," + track_duration + "," + track_year + ");") ''' Store track_analysis tuples ''' print ("Track ID: " + h.get_track_id(h5,0)) track_tempo = str(h.get_tempo(h5,0)) track_key = str(h.get_key(h5,0)) track_danceability = str(h.get_danceability(h5,0)) if track_danceability == "nan": track_danceability = "0.0" track_hottness = str(h.get_song_hotttnesss(h5,0)) if track_hottness == "nan": track_hottness = "0.0" cursor.execute("SELECT * FROM track_analysis WHERE track_id = '" + track_id + "'") rs = cursor.fetchall() if cursor.rowcount != 1: cursor.execute("INSERT INTO track_analysis VALUES ('" + track_id + "'," + track_tempo + "," + track_key + "," + track_danceability + "," + track_hottness + ");") h5.close() db.commit()
def main(argv): if len(argv) != 1: print "Specify data directory" return basedir = argv[0] outputFile1 = open('SongCSV.csv', 'w') outputFile2 = open('TagsCSV.csv', 'w') csvRowString = "" csvLabelString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input("\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"+ " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString); csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) #csvRowString = ("SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,"+ # "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,"+ # "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"+ # "Title,Year") csvRowString = ("ArtistFamiliarity,ArtistHotttnesss,"+ "BarsConfidence,BarsStart,BeatsConfidence,BeatsStart,Duration,"+ "EndOfFadeIn,Key,KeyConfidence,Loudness,Mode,ModeConfidence,"+ "SectionsConfidence,SectionsStart,SegmentsConfidence,SegmentsLoudnessMax,"+ "SegmentsLoudnessMaxTime,SegmentsLoudnessStart,SegmentsStart,"+ "SongHotttnesss,StartOfFadeOut,TatumsConfidence,TatumsStart,Tempo,TimeSignature,TimeSignatureConfidence,"+ "SegmentsPitches,SegmentsTimbre,Title,Year,Decade,ArtistMbtags") ################################################# header = str() csvAttributeList = re.split('\W+', csvRowString) arrayAttributes = ["BarsConfidence","BarsStart","BeatsConfidence","BeatsStart", "SectionsConfidence","SectionsStart","SegmentsConfidence","SegmentsLoudnessMax", "SegmentsLoudnessMaxTime","SegmentsLoudnessStart","SegmentsStart", "TatumsConfidence","TatumsStart"] for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() if(v=="SegmentsPitches"): for i in range(90): header = header + "SegmentsPitches" + str(i) + "," elif(v=="SegmentsTimbre"): for i in range(90): header = header + "SegmentsTimbre" + str(i) + "," elif(v in arrayAttributes): header = header + v + str(0) + "," header = header + v + str(1) + "," else: header = header + v + "," outputFile1.write("SongNumber,"); #outputFile1.write(csvRowString + "\n"); outputFile1.write(header + "\n"); csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate #basedir = "MillionSongSubset/data/A/A/" # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP all = sorted(os.walk(basedir)) for root, dirs, files in all: files = sorted(glob.glob(os.path.join(root,'*'+ext))) for f in files: print f songH5File = hdf5_getters.open_h5_file_read(f) song = Song(str(hdf5_getters.get_song_id(songH5File))) #testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) song.analysisSampleRate = str(hdf5_getters.get_analysis_sample_rate(songH5File)) song.artistFamiliarity = str(hdf5_getters.get_artist_familiarity(songH5File)) song.artistHotttnesss = str(hdf5_getters.get_artist_hotttnesss(songH5File)) song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File)) song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File)) song.artistMbid = str(hdf5_getters.get_artist_mbid(songH5File)) song.barsConfidence = np.array(hdf5_getters.get_bars_confidence(songH5File)) song.barsStart = np.array(hdf5_getters.get_bars_start(songH5File)) song.beatsConfidence = np.array(hdf5_getters.get_beats_confidence(songH5File)) song.beatsStart = np.array(hdf5_getters.get_beats_start(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) song.endOfFadeIn = str(hdf5_getters.get_end_of_fade_in(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.key = str(hdf5_getters.get_key(songH5File)) song.keyConfidence = str(hdf5_getters.get_key_confidence(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.modeConfidence = str(hdf5_getters.get_mode_confidence(songH5File)) song.sectionsConfidence = np.array(hdf5_getters.get_sections_confidence(songH5File)) song.sectionsStart = np.array(hdf5_getters.get_sections_start(songH5File)) song.segmentsConfidence = np.array(hdf5_getters.get_segments_confidence(songH5File)) song.segmentsLoudnessMax = np.array(hdf5_getters.get_segments_loudness_max(songH5File)) song.segmentsLoudnessMaxTime = np.array(hdf5_getters.get_segments_loudness_max_time(songH5File)) song.segmentsLoudnessStart = np.array(hdf5_getters.get_segments_loudness_start(songH5File)) song.segmentsPitches = np.array(hdf5_getters.get_segments_pitches(songH5File)) song.segmentsStart = np.array(hdf5_getters.get_segments_start(songH5File)) song.segmentsTimbre = np.array(hdf5_getters.get_segments_timbre(songH5File)) song.songHotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File)) song.startOfFadeOut = str(hdf5_getters.get_start_of_fade_out(songH5File)) song.tatumsConfidence = np.array(hdf5_getters.get_tatums_confidence(songH5File)) song.tatumsStart = np.array(hdf5_getters.get_tatums_start(songH5File)) song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str(hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str(hdf5_getters.get_time_signature_confidence(songH5File)) song.songid = str(hdf5_getters.get_song_id(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) song.artistMbtags = str(hdf5_getters.get_artist_mbtags(songH5File)) #print song count csvRowString += str(song.songCount) + "," csvLabelString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AnalysisSampleRate'.lower(): csvRowString += song.analysisSampleRate elif attribute == 'ArtistFamiliarity'.lower(): csvRowString += song.artistFamiliarity elif attribute == 'ArtistHotttnesss'.lower(): csvRowString += song.artistHotttnesss elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistMbid'.lower(): csvRowString += song.artistMbid elif attribute == 'BarsConfidence'.lower(): arr = song.barsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'BarsStart'.lower(): arr = song.barsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'BeatsConfidence'.lower(): arr = song.beatsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'BeatsStart'.lower(): arr = song.beatsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'EndOfFadeIn'.lower(): csvRowString += song.endOfFadeIn elif attribute == 'Energy'.lower(): csvRowString += song.energy elif attribute == 'Key'.lower(): csvRowString += song.key elif attribute == 'KeyConfidence'.lower(): csvRowString += song.keyConfidence elif attribute == 'Loudness'.lower(): csvRowString += song.loudness elif attribute == 'Mode'.lower(): csvRowString += song.mode elif attribute == 'ModeConfidence'.lower(): csvRowString += song.modeConfidence elif attribute == 'SectionsConfidence'.lower(): arr = song.sectionsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SectionsStart'.lower(): arr = song.sectionsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsConfidence'.lower(): arr = song.segmentsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsLoudnessMax'.lower(): arr = song.segmentsLoudnessMax if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsLoudnessMaxTime'.lower(): arr = song.segmentsLoudnessMaxTime if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsLoudnessStart'.lower(): arr = song.segmentsLoudnessStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SegmentsStart'.lower(): arr = song.segmentsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'SongHotttnesss'.lower(): hotttnesss = song.songHotttnesss if hotttnesss == 'nan': hotttnesss = 'NaN' csvRowString += hotttnesss elif attribute == 'StartOfFadeOut'.lower(): csvRowString += song.startOfFadeOut elif attribute == 'TatumsConfidence'.lower(): arr = song.tatumsConfidence if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'TatumsStart'.lower(): arr = song.tatumsStart if arr.shape[0] == 0: arrmean = '' arrnorm = '' else: arrmean = np.mean(arr) arrnorm = np.linalg.norm(arr) csvRowString += str(arrmean) + ',' + str(arrnorm) elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'SegmentsPitches'.lower(): colmean = np.mean(song.segmentsPitches,axis=0) for m in colmean: csvRowString += str(m) + "," cov = np.dot(song.segmentsPitches.T,song.segmentsPitches) utriind = np.triu_indices(cov.shape[0]) feats = cov[utriind] for feat in feats: csvRowString += str(feat) + "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] elif attribute == 'SegmentsTimbre'.lower(): colmean = np.mean(song.segmentsTimbre,axis=0) for m in colmean: csvRowString += str(m) + "," cov = np.dot(song.segmentsTimbre.T,song.segmentsTimbre) utriind = np.triu_indices(cov.shape[0]) feats = cov[utriind] for feat in feats: csvRowString += str(feat) + "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year elif attribute == 'Decade'.lower(): yr = song.year if yr > 0: decade = song.year[:-1] + '0' else: decade = '0' csvRowString += decade elif attribute == 'ArtistMbtags'.lower(): tags = song.artistMbtags[1:-1] tags = "\"" + tags + "\"" tags = tags.replace("\n",'') csvRowString += tags tagsarray = shlex.split(tags) for t in tagsarray: csvLabelString += t + "," else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," ''' if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace(',',"") csvRowString += "\"" + albumName + "\"" elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',','') csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): csvRowString += "\"" + song.artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," ''' #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex-1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" lastIndex = len(csvLabelString) csvLabelString = csvLabelString[0:lastIndex-1] csvLabelString += "\n" outputFile2.write(csvLabelString) csvLabelString = "" songH5File.close() outputFile1.close() outputFile2.close()
def hd5_single_random_file_parser(): # Open an h5 file in read mode h5 = hdf5_getters.open_h5_file_read( '/home/skalogerakis/Documents/MillionSong/MillionSongSubset/A/M/G/TRAMGDX12903CEF79F.h5' ) function_tracker = filter( lambda x: x.startswith('get'), hdf5_getters.__dict__.keys()) # Detects all the getter functions for f in function_tracker: # Print everything in function tracker print(f) # First effort to check what each field contains. print() # 55 available fields (exluding number of songs fields) print("Num of songs -- ", hdf5_getters.get_num_songs(h5)) # One song per file print("Title -- ", hdf5_getters.get_title(h5)) # Print the title of a specific h5 file print("Artist familiarity -- ", hdf5_getters.get_artist_familiarity(h5)) print("Artist hotness -- ", hdf5_getters.get_artist_hotttnesss(h5)) print("Artist ID -- ", hdf5_getters.get_artist_id(h5)) print("Artist mbID -- ", hdf5_getters.get_artist_mbid(h5)) print("Artist playmeid -- ", hdf5_getters.get_artist_playmeid(h5)) print("Artist 7DigitalID -- ", hdf5_getters.get_artist_7digitalid(h5)) print("Artist latitude -- ", hdf5_getters.get_artist_latitude(h5)) print("Artist longitude -- ", hdf5_getters.get_artist_longitude(h5)) print("Artist location -- ", hdf5_getters.get_artist_location(h5)) print("Artist Name -- ", hdf5_getters.get_artist_name(h5)) print("Release -- ", hdf5_getters.get_release(h5)) print("Release 7DigitalID -- ", hdf5_getters.get_release_7digitalid(h5)) print("Song ID -- ", hdf5_getters.get_song_id(h5)) print("Song Hotness -- ", hdf5_getters.get_song_hotttnesss(h5)) print("Track 7Digital -- ", hdf5_getters.get_track_7digitalid(h5)) print("Similar artists -- ", hdf5_getters.get_similar_artists(h5)) print("Artist terms -- ", hdf5_getters.get_artist_terms(h5)) print("Artist terms freq -- ", hdf5_getters.get_artist_terms_freq(h5)) print("Artist terms weight -- ", hdf5_getters.get_artist_terms_weight(h5)) print("Analysis sample rate -- ", hdf5_getters.get_analysis_sample_rate(h5)) print("Audio md5 -- ", hdf5_getters.get_audio_md5(h5)) print("Danceability -- ", hdf5_getters.get_danceability(h5)) print("Duration -- ", hdf5_getters.get_duration(h5)) print("End of Fade -- ", hdf5_getters.get_end_of_fade_in(h5)) print("Energy -- ", hdf5_getters.get_energy(h5)) print("Key -- ", hdf5_getters.get_key(h5)) print("Key Confidence -- ", hdf5_getters.get_key_confidence(h5)) print("Loudness -- ", hdf5_getters.get_loudness(h5)) print("Mode -- ", hdf5_getters.get_mode(h5)) print("Mode Confidence -- ", hdf5_getters.get_mode_confidence(h5)) print("Start of fade out -- ", hdf5_getters.get_start_of_fade_out(h5)) print("Tempo -- ", hdf5_getters.get_tempo(h5)) print("Time signature -- ", hdf5_getters.get_time_signature(h5)) print("Time signature confidence -- ", hdf5_getters.get_time_signature_confidence(h5)) print("Track ID -- ", hdf5_getters.get_track_id(h5)) print("Segments Start -- ", hdf5_getters.get_segments_start(h5)) print("Segments Confidence -- ", hdf5_getters.get_segments_confidence(h5)) print("Segments Pitches -- ", hdf5_getters.get_segments_pitches(h5)) print("Segments Timbre -- ", hdf5_getters.get_segments_timbre(h5)) print("Segments Loudness max -- ", hdf5_getters.get_segments_loudness_max(h5)) print("Segments Loudness max time-- ", hdf5_getters.get_segments_loudness_max_time(h5)) print("Segments Loudness start -- ", hdf5_getters.get_segments_loudness_start(h5)) print("Sections start -- ", hdf5_getters.get_sections_start(h5)) print("Sections Confidence -- ", hdf5_getters.get_sections_confidence(h5)) print("Beats start -- ", hdf5_getters.get_beats_start(h5)) print("Beats confidence -- ", hdf5_getters.get_beats_confidence(h5)) print("Bars start -- ", hdf5_getters.get_bars_start(h5)) print("Bars confidence -- ", hdf5_getters.get_bars_confidence(h5)) print("Tatums start -- ", hdf5_getters.get_tatums_start(h5)) print("Tatums confidence -- ", hdf5_getters.get_tatums_confidence(h5)) print("Artist mbtags -- ", hdf5_getters.get_artist_mbtags(h5)) print("Artist mbtags count -- ", hdf5_getters.get_artist_mbtags_count(h5)) print("Year -- ", hdf5_getters.get_year(h5)) fields = ['Title', 'Artist ID'] with open('Tester2.csv', 'w', newline='') as csvfile: csv_writer = csv.writer(csvfile, delimiter=';') # writing the fields csv_writer.writerow(fields) # writing the data rows csv_writer.writerow( [hdf5_getters.get_title(h5), hdf5_getters.get_artist_id(h5)]) h5.close() # close h5 when completed in the end
def classify(h5): output_array={} # duration duration=hdf5_getters.get_duration(h5) output_array["duration"]=duration ### ADDED VALUE TO ARRAY # number of bars bars=hdf5_getters.get_bars_start(h5) num_bars=len(bars) output_array["num_bars"]=num_bars ### ADDED VALUE TO ARRAY # mean and variance in bar length bar_length=numpy.ediff1d(bars) variance_bar_length=numpy.var(bar_length) output_array["variance_bar_length"]=variance_bar_length ### ADDED VALUE TO ARRAY # number of beats beats=hdf5_getters.get_beats_start(h5) num_beats=len(beats) output_array["num_beats"]=num_beats ### ADDED VALUE TO ARRAY # mean and variance in beats length beats_length=numpy.ediff1d(beats) variance_beats_length=numpy.var(bar_length) output_array["variance_beats_length"]=variance_beats_length ### ADDED VALUE TO ARRAY # danceability danceability=hdf5_getters.get_danceability(h5) output_array["danceability"]=danceability ### ADDED VALUE TO ARRAY # end of fade in end_of_fade_in=hdf5_getters.get_end_of_fade_in(h5) output_array["end_of_fade_in"]=end_of_fade_in ### ADDED VALUE TO ARRAY # energy energy=hdf5_getters.get_energy(h5) output_array["energy"]=energy ### ADDED VALUE TO ARRAY # key key=hdf5_getters.get_key(h5) output_array["key"]=int(key) ### ADDED VALUE TO ARRAY # loudness loudness=hdf5_getters.get_loudness(h5) output_array["loudness"]=loudness ### ADDED VALUE TO ARRAY # mode mode=hdf5_getters.get_mode(h5) output_array["mode"]=int(mode) ### ADDED VALUE TO ARRAY # number sections sections=hdf5_getters.get_sections_start(h5) num_sections=len(sections) output_array["num_sections"]=num_sections ### ADDED VALUE TO ARRAY # mean and variance in sections length sections_length=numpy.ediff1d(sections) variance_sections_length=numpy.var(sections) output_array["variance_sections_length"]=variance_sections_length ### ADDED VALUE TO ARRAY # number segments segments=hdf5_getters.get_segments_start(h5) num_segments=len(segments) output_array["num_segments"]=num_segments ### ADDED VALUE TO ARRAY # mean and variance in segments length segments_length=numpy.ediff1d(segments) variance_segments_length=numpy.var(segments) output_array["variance_segments_length"]=variance_segments_length ### ADDED VALUE TO ARRAY # segment loudness max segment_loudness_max_array=hdf5_getters.get_segments_loudness_max(h5) segment_loudness_max_time_array=hdf5_getters.get_segments_loudness_max_time(h5) segment_loudness_max_index=0 for i in range(len(segment_loudness_max_array)): if segment_loudness_max_array[i]>segment_loudness_max_array[segment_loudness_max_index]: segment_loudness_max_index=i segment_loudness_max=segment_loudness_max_array[segment_loudness_max_index] segment_loudness_max_time=segment_loudness_max_time_array[segment_loudness_max_index] output_array["segment_loudness_max"]=segment_loudness_max ### ADDED VALUE TO ARRAY output_array["segment_loudness_time"]=segment_loudness_max_time ### ADDED VALUE TO ARRAY # POSSIBLE TODO: use average function instead and weight by segment length # segment loudness mean (start) segment_loudness_array=hdf5_getters.get_segments_loudness_start(h5) segment_loudness_mean=numpy.mean(segment_loudness_array) output_array["segment_loudness_mean"]=segment_loudness_mean ### ADDED VALUE TO ARRAY # segment loudness variance (start) segment_loudness_variance=numpy.var(segment_loudness_array) output_array["segment_loudness_variance"]=segment_loudness_variance ### ADDED VALUE TO ARRAY # segment pitches segment_pitches_array=hdf5_getters.get_segments_pitches(h5) segment_pitches_mean=numpy.mean(segment_pitches_array,axis=0).tolist() output_array["segment_pitches_mean"]=segment_pitches_mean # segment pitches variance (start) segment_pitches_variance=numpy.var(segment_pitches_array,axis=0).tolist() output_array["segment_pitches_variance"]=segment_pitches_variance # segment timbres segment_timbres_array=hdf5_getters.get_segments_timbre(h5) segment_timbres_mean=numpy.mean(segment_timbres_array,axis=0).tolist() output_array["segment_timbres_mean"]=segment_timbres_mean # segment timbres variance (start) segment_timbres_variance=numpy.var(segment_timbres_array,axis=0).tolist() output_array["segment_timbres_variance"]=segment_timbres_variance # hotttnesss hottness=hdf5_getters.get_song_hotttnesss(h5,0) output_array["hottness"]=hottness ### ADDED VALUE TO ARRAY # duration-start of fade out start_of_fade_out=hdf5_getters.get_start_of_fade_out(h5) fade_out=duration-start_of_fade_out output_array["fade_out"]=fade_out ### ADDED VALUE TO ARRAY # tatums tatums=hdf5_getters.get_tatums_start(h5) num_tatums=len(tatums) output_array["num_tatums"]=num_tatums ### ADDED VALUE TO ARRAY # mean and variance in tatums length tatums_length=numpy.ediff1d(tatums) variance_tatums_length=numpy.var(tatums_length) output_array["variance_tatums_length"]=variance_tatums_length ### ADDED VALUE TO ARRAY # tempo tempo=hdf5_getters.get_tempo(h5) output_array["tempo"]=tempo ### ADDED VALUE TO ARRAY # time signature time_signature=hdf5_getters.get_time_signature(h5) output_array["time_signature"]=int(time_signature) ### ADDED VALUE TO ARRAY # year year=hdf5_getters.get_year(h5) output_array["year"]=int(year) ### ADDED VALUE TO ARRAY # artist terms artist_terms=hdf5_getters.get_artist_terms(h5,0) output_array["artist_terms"]=artist_terms.tolist() artist_terms_freq=hdf5_getters.get_artist_terms_freq(h5,0) output_array["artist_terms_freq"]=artist_terms_freq.tolist() artist_name=hdf5_getters.get_artist_name(h5,0) output_array["artist_name"]=artist_name artist_id=hdf5_getters.get_artist_id(h5,0) output_array["artist_id"]=artist_id # title title=hdf5_getters.get_title(h5,0) output_array["title"]=title return output_array
def classify(h5): output_array = {} # duration duration = hdf5_getters.get_duration(h5) output_array["duration"] = duration ### ADDED VALUE TO ARRAY # number of bars bars = hdf5_getters.get_bars_start(h5) num_bars = len(bars) output_array["num_bars"] = num_bars ### ADDED VALUE TO ARRAY # mean and variance in bar length bar_length = numpy.ediff1d(bars) variance_bar_length = numpy.var(bar_length) output_array[ "variance_bar_length"] = variance_bar_length ### ADDED VALUE TO ARRAY # number of beats beats = hdf5_getters.get_beats_start(h5) num_beats = len(beats) output_array["num_beats"] = num_beats ### ADDED VALUE TO ARRAY # mean and variance in beats length beats_length = numpy.ediff1d(beats) variance_beats_length = numpy.var(bar_length) output_array[ "variance_beats_length"] = variance_beats_length ### ADDED VALUE TO ARRAY # danceability danceability = hdf5_getters.get_danceability(h5) output_array["danceability"] = danceability ### ADDED VALUE TO ARRAY # end of fade in end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5) output_array["end_of_fade_in"] = end_of_fade_in ### ADDED VALUE TO ARRAY # energy energy = hdf5_getters.get_energy(h5) output_array["energy"] = energy ### ADDED VALUE TO ARRAY # key key = hdf5_getters.get_key(h5) output_array["key"] = int(key) ### ADDED VALUE TO ARRAY # loudness loudness = hdf5_getters.get_loudness(h5) output_array["loudness"] = loudness ### ADDED VALUE TO ARRAY # mode mode = hdf5_getters.get_mode(h5) output_array["mode"] = int(mode) ### ADDED VALUE TO ARRAY # number sections sections = hdf5_getters.get_sections_start(h5) num_sections = len(sections) output_array["num_sections"] = num_sections ### ADDED VALUE TO ARRAY # mean and variance in sections length sections_length = numpy.ediff1d(sections) variance_sections_length = numpy.var(sections) output_array[ "variance_sections_length"] = variance_sections_length ### ADDED VALUE TO ARRAY # number segments segments = hdf5_getters.get_segments_start(h5) num_segments = len(segments) output_array["num_segments"] = num_segments ### ADDED VALUE TO ARRAY # mean and variance in segments length segments_length = numpy.ediff1d(segments) variance_segments_length = numpy.var(segments) output_array[ "variance_segments_length"] = variance_segments_length ### ADDED VALUE TO ARRAY # segment loudness max segment_loudness_max_array = hdf5_getters.get_segments_loudness_max(h5) segment_loudness_max_time_array = hdf5_getters.get_segments_loudness_max_time( h5) segment_loudness_max_index = 0 for i in range(len(segment_loudness_max_array)): if segment_loudness_max_array[i] > segment_loudness_max_array[ segment_loudness_max_index]: segment_loudness_max_index = i segment_loudness_max = segment_loudness_max_array[ segment_loudness_max_index] segment_loudness_max_time = segment_loudness_max_time_array[ segment_loudness_max_index] output_array[ "segment_loudness_max"] = segment_loudness_max ### ADDED VALUE TO ARRAY output_array[ "segment_loudness_time"] = segment_loudness_max_time ### ADDED VALUE TO ARRAY # POSSIBLE TODO: use average function instead and weight by segment length # segment loudness mean (start) segment_loudness_array = hdf5_getters.get_segments_loudness_start(h5) segment_loudness_mean = numpy.mean(segment_loudness_array) output_array[ "segment_loudness_mean"] = segment_loudness_mean ### ADDED VALUE TO ARRAY # segment loudness variance (start) segment_loudness_variance = numpy.var(segment_loudness_array) output_array[ "segment_loudness_variance"] = segment_loudness_variance ### ADDED VALUE TO ARRAY # segment pitches segment_pitches_array = hdf5_getters.get_segments_pitches(h5) segment_pitches_mean = numpy.mean(segment_pitches_array, axis=0).tolist() output_array["segment_pitches_mean"] = segment_pitches_mean # segment pitches variance (start) segment_pitches_variance = numpy.var(segment_pitches_array, axis=0).tolist() output_array["segment_pitches_variance"] = segment_pitches_variance # segment timbres segment_timbres_array = hdf5_getters.get_segments_timbre(h5) segment_timbres_mean = numpy.mean(segment_timbres_array, axis=0).tolist() output_array["segment_timbres_mean"] = segment_timbres_mean # segment timbres variance (start) segment_timbres_variance = numpy.var(segment_timbres_array, axis=0).tolist() output_array["segment_timbres_variance"] = segment_timbres_variance # hotttnesss hottness = hdf5_getters.get_song_hotttnesss(h5, 0) output_array["hottness"] = hottness ### ADDED VALUE TO ARRAY # duration-start of fade out start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5) fade_out = duration - start_of_fade_out output_array["fade_out"] = fade_out ### ADDED VALUE TO ARRAY # tatums tatums = hdf5_getters.get_tatums_start(h5) num_tatums = len(tatums) output_array["num_tatums"] = num_tatums ### ADDED VALUE TO ARRAY # mean and variance in tatums length tatums_length = numpy.ediff1d(tatums) variance_tatums_length = numpy.var(tatums_length) output_array[ "variance_tatums_length"] = variance_tatums_length ### ADDED VALUE TO ARRAY # tempo tempo = hdf5_getters.get_tempo(h5) output_array["tempo"] = tempo ### ADDED VALUE TO ARRAY # time signature time_signature = hdf5_getters.get_time_signature(h5) output_array["time_signature"] = int( time_signature) ### ADDED VALUE TO ARRAY # year year = hdf5_getters.get_year(h5) output_array["year"] = int(year) ### ADDED VALUE TO ARRAY # artist terms artist_terms = hdf5_getters.get_artist_terms(h5, 0) output_array["artist_terms"] = artist_terms.tolist() artist_terms_freq = hdf5_getters.get_artist_terms_freq(h5, 0) output_array["artist_terms_freq"] = artist_terms_freq.tolist() artist_name = hdf5_getters.get_artist_name(h5, 0) output_array["artist_name"] = artist_name artist_id = hdf5_getters.get_artist_id(h5, 0) output_array["artist_id"] = artist_id # title title = hdf5_getters.get_title(h5, 0) output_array["title"] = title return output_array
def data_to_flat_file(basedir,ext='.h5') : """This function extract the information from the tables and creates the flat file.""" count = 0; #song counter list_to_write= [] row_to_write = "" writer = csv.writer(open("metadata_wholeA.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root,'*'+ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title= title.replace('"','') comma=title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album= album.replace('"','') #eliminating commas in the album comma=album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma=artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name= artist_name.replace('"','') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam=-1 artist_hotness= hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness=-1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat=-1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,"); if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon=-1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability=-1 end_fade_in =hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in=-1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy=-1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c=-1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness=-1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf=-1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot=-1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo=-1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c=-1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg= get_avg(bars_c) bars_c_max= get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev= get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max= get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev= get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg= get_avg(beats_c) beats_c_max= get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev= get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max= get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev= get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg= get_avg(sec_c) sec_c_max= get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev= get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max= get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev= get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg= get_avg(seg_c) seg_c_max= get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev= get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg= get_avg(seg_loud_max) seg_loud_max_max= get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev= get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg= get_avg(seg_loud_max_time) seg_loud_max_time_max= get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev= get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg= get_avg(seg_loud_start) seg_loud_start_max= get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev= get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg= get_avg(seg_start) seg_start_max= get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev= get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg= get_avg(tatms_c) tatms_c_max= get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev= get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg= get_avg(tatms_start) tatms_start_max= get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev= get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq final_genre=[] genres_so_far=[] for i in range(len(genre_indexes)): genre_tmp=get_genre(art_trm,genre_indexes[i]) #genre that corresponds to the highest freq genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set=1 #genre was found in dictionary if genre_set == 1: col_num=[] for genre in final_genre: column=int(genre) #getting the column number of the genre col_num.append(column) genre_array=genre_columns(col_num) #genre array else: genre_array=genre_columns(-1) #the genre was not found in the dictionary transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg=[] seg_pitch_max=[] seg_pitch_min=[] seg_pitch_stddev=[] seg_pitch_count=[] seg_pitch_sum=[] i=0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i=i+1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg=[] seg_timbre_max=[] seg_timbre_min=[] seg_timbre_stddev=[] seg_timbre_count=[] seg_timbre_sum=[] i=0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i=i+1 #Writing to the flat file writer.writerow([title,album,artist_name,year,duration,seg_start_count, tempo]) h5.close() count=count+1; print count;
def parse_aggregate_songs(file_name,file_name2,artist_map): """ Given an aggregate filename and artist_map in the format {artist_name: {data pertaining to artist}} """ """ TODO: -this function goes through each song, if artist not in there, add all data necesary and add first song info. else update any specific song info -song info is a map from attributename:[values] """ #artist_map = {} h5 = hdf5_getters.open_h5_file_read(file_name) numSongs = hdf5_getters.get_num_songs(h5) print 'Parsing song file...' for i in range(numSongs): artist_name = hdf5_getters.get_artist_name(h5,i) #Filter location longi = hdf5_getters.get_artist_longitude(h5,i) lat = hdf5_getters.get_artist_latitude(h5,i) loc = hdf5_getters.get_artist_location(h5,i) if math.isnan(lat) or math.isnan(longi): #skip if no location continue #filter year yr = hdf5_getters.get_year(h5,i) if yr == 0: #skip if no year continue #filter hotttness and familiarity familiarity = hdf5_getters.get_artist_familiarity(h5,i) hotttness = hdf5_getters.get_artist_hotttnesss(h5,i) if familiarity<=0.0 or hotttness<=0.0: #skip if no hotttness or familiarity computations continue #TODO:MAYBE filter on dance and energy timbre = hdf5_getters.get_segments_timbre(h5,i) #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each if not artist_name in artist_map: #have not encountered the artist yet, so populate new map sub_map = {} sub_map['artist_familiarity'] = familiarity sub_map['artist_hotttnesss'] = hotttness sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i) #longi = hdf5_getters.get_artist_longitude(h5,i) #lat = hdf5_getters.get_artist_latitude(h5,i) #longi = None if math.isnan(longi) else longi #lat = None if math.isnan(lat) else lat sub_map['artist_latitude'] = lat sub_map['artist_longitude'] = longi sub_map['artist_location'] = loc sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i) #TODO:see if should weight by freq or weight for if the term matches one of the feature terms sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i)) sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i)) #song-sepcific data #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA: #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy sub_map['danceability'] = [dance] sub_map['duration'] = [hdf5_getters.get_duration(h5,i)] sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)] sub_map['energy'] = [energy] #since each song has a key, ask if feature for keys should be num of songs that appear in that key or #just binary if any of their songs has that key or just be avg of songs with that key #same for mode, since its either major or minor...should it be count or avg.? sub_map['key'] = [hdf5_getters.get_key(h5,i)] sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)] sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot sub_map['song_hotttnesss'] = [s_hot] sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)] sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)] #should time signature be count as well? binary? sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)] sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)] #should year be binary since they can have many songs across years and should it be year:count sub_map['year'] = [yr] artist_map[artist_name] = sub_map else: #artist already exists, so get its map and update song fields dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy artist_map[artist_name]['danceability'].append(dance) artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i)) artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i)) artist_map[artist_name]['energy'].append(energy) artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i)) artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i)) artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot artist_map[artist_name]['song_hotttnesss'].append(s_hot) artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i)) artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i)) #should time signature be count as well? binary? artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i)) artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i)) #should year be binary since they can have many songs across years and should it be year:count artist_map[artist_name]['year'].append(yr) h5 = hdf5_getters.open_h5_file_read(file_name2) numSongs = hdf5_getters.get_num_songs(h5) print 'Parsing song file2...' for i in range(numSongs): song_id = hdf5_getters.get_track_id(h5,i) artist_name = hdf5_getters.get_artist_name(h5,i) if artist_name in artist_map and song_id in artist_map[artist_name]['track_id']: continue #Filter location longi = hdf5_getters.get_artist_longitude(h5,i) lat = hdf5_getters.get_artist_latitude(h5,i) loc = hdf5_getters.get_artist_location(h5,i) if math.isnan(lat) or math.isnan(longi): #skip if no location continue #filter year yr = hdf5_getters.get_year(h5,i) if yr == 0: #skip if no year continue #filter hotttness and familiarity familiarity = hdf5_getters.get_artist_familiarity(h5,i) hotttness = hdf5_getters.get_artist_hotttnesss(h5,i) if familiarity<=0.0 or hotttness<=0.0: #skip if no hotttness or familiarity computations continue #TODO:MAYBE filter on dance and energy timbre = hdf5_getters.get_segments_timbre(h5,i) #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each if not artist_name in artist_map: #have not encountered the artist yet, so populate new map sub_map = {} sub_map['artist_familiarity'] = familiarity sub_map['artist_hotttnesss'] = hotttness sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i) #longi = hdf5_getters.get_artist_longitude(h5,i) #lat = hdf5_getters.get_artist_latitude(h5,i) #longi = None if math.isnan(longi) else longi #lat = None if math.isnan(lat) else lat sub_map['artist_latitude'] = lat sub_map['artist_longitude'] = longi sub_map['artist_location'] = loc sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i) #TODO:see if should weight by freq or weight for if the term matches one of the feature terms sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i)) sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i)) #song-sepcific data #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA: #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy sub_map['danceability'] = [dance] sub_map['duration'] = [hdf5_getters.get_duration(h5,i)] sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)] sub_map['energy'] = [energy] #since each song has a key, ask if feature for keys should be num of songs that appear in that key or #just binary if any of their songs has that key or just be avg of songs with that key #same for mode, since its either major or minor...should it be count or avg.? sub_map['key'] = [hdf5_getters.get_key(h5,i)] sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)] sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot sub_map['song_hotttnesss'] = [s_hot] sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)] sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)] #should time signature be count as well? binary? sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)] sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)] #should year be binary since they can have many songs across years and should it be year:count sub_map['year'] = [yr] artist_map[artist_name] = sub_map else: #artist already exists, so get its map and update song fields dance = hdf5_getters.get_danceability(h5,i) dance = None if dance == 0.0 else dance energy = hdf5_getters.get_energy(h5,i) energy = None if energy == 0.0 else energy artist_map[artist_name]['danceability'].append(dance) artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i)) artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i)) artist_map[artist_name]['energy'].append(energy) artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i)) artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i)) artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1 s_hot = hdf5_getters.get_song_hotttnesss(h5,i) s_hot = None if math.isnan(s_hot) else s_hot artist_map[artist_name]['song_hotttnesss'].append(s_hot) artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i)) artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i)) #should time signature be count as well? binary? artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i)) artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i)) #should year be binary since they can have many songs across years and should it be year:count artist_map[artist_name]['year'].append(yr)
connection.commit() glob_path = '/Users/kenleejr92/MillionSongSubset/data/*/*/*/*' filepaths = glob.glob(glob_path) for filepath in tqdm(filepaths): h5 = hdf5_getters.open_h5_file_read(filepath) n = hdf5_getters.get_num_songs(h5) for row in range(n): song_id = hdf5_getters.get_song_id(h5, songidx=row).decode('UTF-8') # artist = hdf5_getters.get_artist_name(h5,songidx=row).decode('UTF-8') # title= hdf5_getters.get_title(h5,songidx=row)#.decode('UTF-8') # artist = "".join(c for c in unicodedata.normalize('NFD', str(artist.decode("utf8"))) if unicodedata.category(c) != "Mn") # title = "".join(c for c in unicodedata.normalize('NFD', str(title.decode("utf8"))) if unicodedata.category(c) != "Mn") #single number features danceability = hdf5_getters.get_danceability(h5, songidx=row) duration = hdf5_getters.get_duration(h5, songidx=row) energy = hdf5_getters.get_energy(h5, songidx=row) loudness = hdf5_getters.get_loudness(h5, songidx=row) musicalKey = hdf5_getters.get_key(h5, songidx=row) mode = hdf5_getters.get_mode(h5, songidx=row) tempo = hdf5_getters.get_tempo(h5, songidx=row) time_signature = hdf5_getters.get_time_signature(h5, songidx=row) year = hdf5_getters.get_year(h5, songidx=row) song_hottness = hdf5_getters.get_song_hotttnesss(h5, songidx=row) end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5, songidx=row) start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5, songidx=row) #timestamp features #take last element and divide by length to get beats/unit time, segments/unit_time bars_start = hdf5_getters.get_bars_start(h5, songidx=row)
def apply_to_all_files_mod(basedir, song_list, filename='songs.npy', func=lambda x: x, ext='.h5'): """ From a base directory, goes through all subdirectories, finds all files with the given ext, and reads each song from each file. For each song in song_list, gets the title, artist, tempo, familiarity, hottness, terms, dancebility, duration, energy, loudness, and the timbre matrix. Tab delimits terms, flattens the timbre matrix, adds them all to a np array, and saves the array with the information from each song to filename. """ #Initial list of desired song info csv_data = [] count = 0 done_gg = False song_dict = construct_song_dict(song_list) # iterate over all files in all subdirectories for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) #Iterates through each file in files for filename in files: count += 1 if count % 1000 == 0: print count h5 = GETTERS.open_h5_file_read(filename) #Scrapes desired data title = GETTERS.get_title(h5) artist = GETTERS.get_artist_name(h5) tempo = GETTERS.get_tempo(h5) familiarity = GETTERS.get_artist_familiarity(h5) hotness = GETTERS.get_artist_hotttnesss(h5) terms = GETTERS.get_artist_terms(h5) danceability = GETTERS.get_danceability(h5) duration = GETTERS.get_duration(h5) energy = GETTERS.get_energy(h5) loudness = GETTERS.get_loudness(h5) timbre = GETTERS.get_segments_timbre(h5) #Tab delimits terms terms_tabs = "\t".join(terms) #Flattens timbre timbre_flattened = timbre.flatten() #Creates np array of everything but timbre matrix everything_but_timbre = np.array([ title, artist, tempo, familiarity, hotness, terms_tabs, danceability, duration, energy, loudness ]) #Combines everything else with timbre matrix row = np.concatenate((everything_but_timbre, timbre_flattened)) #Checks if artist, song combination was in the list and, if so, adds it. if artist in song_dict: if title in song_dict[artist]: print("Adding {} by {}. Song ID is: {}".format( title, artist, GETTERS.get_song_id(h5))) csv_data.append(row) #Prevents duplicates song_dict[artist][song_dict[artist].index(title)] = '' h5.close() print("Number of songs: {}, artists {}".format(len(csv_data), len(song_dict))) csv_array = np.array(csv_data) #Saves data np.save(filename, csv_array)
def main(): basedir = "./../songMetaInfo.txt" ext = ".h5" if len(sys.argv) > 1: basedir = sys.argv[1] outputfile = 'SongFileMetaData.csv' if len(sys.argv) > 2: outputfile = sys.argv[2] csvWriter = open(outputfile, 'w') csvWriter.write( "title,songId,artistId,artistfamilarity,artistHotness,songHotness," + "songEnfOfFadeIn,startFadeout,energy,loudness,albumID,albumName,artistName,danceability,duration,keySignatureConfidence,tempo,timeSignature,timeSignatureConfidence,year\n" ) with open(basedir) as file: for line in file.readlines(): f = line.strip() #newf = f + "text" print f #print f try: songH5File = hdf5_getters.open_h5_file_read(f) csvStr = "" #0 title = str(hdf5_getters.get_title(songH5File)) csvStr += title + "," #1 songId = str(hdf5_getters.get_song_id(songH5File)) csvStr += songId + "," #2 artistId = str(hdf5_getters.get_artist_id(songH5File)) csvStr += artistId + "," #3 artistfamilarity = str( hdf5_getters.get_artist_familiarity(songH5File)) csvStr += artistfamilarity + "," #4 artistHotness = str( hdf5_getters.get_artist_hotttnesss(songH5File)) csvStr += artistHotness + "," #5 songHotness = str(hdf5_getters.get_song_hotttnesss(songH5File)) csvStr += songHotness + "," #6 songEnfOfFadeIn = str( hdf5_getters.get_end_of_fade_in(songH5File)) csvStr += songEnfOfFadeIn + "," #7 startFadeOut = str( hdf5_getters.get_start_of_fade_out(songH5File)) csvStr += startFadeOut + "," #8 energy = str(hdf5_getters.get_energy(songH5File)) csvStr += energy + "," #9 loudness = str(hdf5_getters.get_loudness(songH5File)) csvStr += loudness + "," #10 albumID = str(hdf5_getters.get_release_7digitalid(songH5File)) csvStr += albumID + "," #11 albumName = str(hdf5_getters.get_release(songH5File)) csvStr += albumName + "," #12 artistName = str(hdf5_getters.get_artist_name(songH5File)) csvStr += artistName + "," #13 danceability = str(hdf5_getters.get_danceability(songH5File)) csvStr += danceability + "," #14 duration = str(hdf5_getters.get_duration(songH5File)) csvStr += duration + "," #15 keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File)) csvStr += keySignatureConfidence + "," #16 tempo = str(hdf5_getters.get_tempo(songH5File)) csvStr += tempo + "," ## 17 timeSignature = str( hdf5_getters.get_time_signature(songH5File)) csvStr += timeSignature + "," #18 timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) csvStr += timeSignatureConfidence + "," #19 year = str(hdf5_getters.get_year(songH5File)) csvStr += year + "," #print song count csvStr += "\n" csvWriter.write(csvStr) #print csvStr songH5File.close() except: print "Error in processing file" csvWriter.close()
for parent_folder in os.listdir(path): for sub_folder in os.listdir(path + '/' + parent_folder): for child_folder in os.listdir(path + '/' + parent_folder + '/' + sub_folder): for file in os.listdir(path + '/' + parent_folder + '/' + sub_folder + '/' + child_folder): with h5.open_h5_file_read(path + '/' + parent_folder + '/' + sub_folder + '/' + child_folder + '/' + file) as ds: row = [] row += [h5.get_artist_terms(ds)] row += [h5.get_artist_terms_freq(ds)] row += [h5.get_artist_terms_weight(ds)] row += [h5.get_danceability(ds)] row += [h5.get_energy(ds)] row += [h5.get_key(ds)] row += [h5.get_mode(ds)] row += [h5.get_loudness(ds)] row += [ parent_folder + '/' + sub_folder + '/' + child_folder + '/' ] row += [file] row += [h5.get_duration(ds)] row += [h5.get_artist_familiarity(ds)] row += [h5.get_similar_artists(ds)] row += [h5.get_artist_id(ds)] row += [h5.get_title(ds)]
def data_to_flat_file(basedir, ext='.h5'): """This function extract the information from the tables and creates the flat file.""" count = 0 #song counter list_to_write = [] row_to_write = "" writer = csv.writer(open("metadata_wholeA.csv", "wb")) for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print f #the name of the file h5 = hdf5_getters.open_h5_file_read(f) title = hdf5_getters.get_title(h5) title = title.replace('"', '') comma = title.find(',') #eliminating commas in the title if comma != -1: print title time.sleep(1) album = hdf5_getters.get_release(h5) album = album.replace('"', '') #eliminating commas in the album comma = album.find(',') if comma != -1: print album time.sleep(1) artist_name = hdf5_getters.get_artist_name(h5) comma = artist_name.find(',') if comma != -1: print artist_name time.sleep(1) artist_name = artist_name.replace('"', '') #eliminating double quotes duration = hdf5_getters.get_duration(h5) samp_rt = hdf5_getters.get_analysis_sample_rate(h5) artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5) artist_fam = hdf5_getters.get_artist_familiarity(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_fam) == True: artist_fam = -1 artist_hotness = hdf5_getters.get_artist_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_hotness) == True: artist_hotness = -1 artist_id = hdf5_getters.get_artist_id(h5) artist_lat = hdf5_getters.get_artist_latitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lat) == True: artist_lat = -1 artist_loc = hdf5_getters.get_artist_location(h5) #checks artist_loc to see if it is a hyperlink if it is set as empty string artist_loc = artist_loc.replace(",", "\,") if artist_loc.startswith("<a"): artist_loc = "" if len(artist_loc) > 100: artist_loc = "" artist_lon = hdf5_getters.get_artist_longitude(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(artist_lon) == True: artist_lon = -1 artist_mbid = hdf5_getters.get_artist_mbid(h5) artist_pmid = hdf5_getters.get_artist_playmeid(h5) audio_md5 = hdf5_getters.get_audio_md5(h5) danceability = hdf5_getters.get_danceability(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(danceability) == True: danceability = -1 end_fade_in = hdf5_getters.get_end_of_fade_in(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(end_fade_in) == True: end_fade_in = -1 energy = hdf5_getters.get_energy(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(energy) == True: energy = -1 song_key = hdf5_getters.get_key(h5) key_c = hdf5_getters.get_key_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(key_c) == True: key_c = -1 loudness = hdf5_getters.get_loudness(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(loudness) == True: loudness = -1 mode = hdf5_getters.get_mode(h5) mode_conf = hdf5_getters.get_mode_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(mode_conf) == True: mode_conf = -1 release_7digitalid = hdf5_getters.get_release_7digitalid(h5) song_hot = hdf5_getters.get_song_hotttnesss(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(song_hot) == True: song_hot = -1 song_id = hdf5_getters.get_song_id(h5) start_fade_out = hdf5_getters.get_start_of_fade_out(h5) tempo = hdf5_getters.get_tempo(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(tempo) == True: tempo = -1 time_sig = hdf5_getters.get_time_signature(h5) time_sig_c = hdf5_getters.get_time_signature_confidence(h5) #checking if we get a "nan" if we do we change it to -1 if numpy.isnan(time_sig_c) == True: time_sig_c = -1 track_id = hdf5_getters.get_track_id(h5) track_7digitalid = hdf5_getters.get_track_7digitalid(h5) year = hdf5_getters.get_year(h5) bars_c = hdf5_getters.get_bars_confidence(h5) bars_c_avg = get_avg(bars_c) bars_c_max = get_max(bars_c) bars_c_min = get_min(bars_c) bars_c_stddev = get_stddev(bars_c) bars_c_count = get_count(bars_c) bars_c_sum = get_sum(bars_c) bars_start = hdf5_getters.get_bars_start(h5) bars_start_avg = get_avg(bars_start) bars_start_max = get_max(bars_start) bars_start_min = get_min(bars_start) bars_start_stddev = get_stddev(bars_start) bars_start_count = get_count(bars_start) bars_start_sum = get_sum(bars_start) beats_c = hdf5_getters.get_beats_confidence(h5) beats_c_avg = get_avg(beats_c) beats_c_max = get_max(beats_c) beats_c_min = get_min(beats_c) beats_c_stddev = get_stddev(beats_c) beats_c_count = get_count(beats_c) beats_c_sum = get_sum(beats_c) beats_start = hdf5_getters.get_beats_start(h5) beats_start_avg = get_avg(beats_start) beats_start_max = get_max(beats_start) beats_start_min = get_min(beats_start) beats_start_stddev = get_stddev(beats_start) beats_start_count = get_count(beats_start) beats_start_sum = get_sum(beats_start) sec_c = hdf5_getters.get_sections_confidence(h5) sec_c_avg = get_avg(sec_c) sec_c_max = get_max(sec_c) sec_c_min = get_min(sec_c) sec_c_stddev = get_stddev(sec_c) sec_c_count = get_count(sec_c) sec_c_sum = get_sum(sec_c) sec_start = hdf5_getters.get_sections_start(h5) sec_start_avg = get_avg(sec_start) sec_start_max = get_max(sec_start) sec_start_min = get_min(sec_start) sec_start_stddev = get_stddev(sec_start) sec_start_count = get_count(sec_start) sec_start_sum = get_sum(sec_start) seg_c = hdf5_getters.get_segments_confidence(h5) seg_c_avg = get_avg(seg_c) seg_c_max = get_max(seg_c) seg_c_min = get_min(seg_c) seg_c_stddev = get_stddev(seg_c) seg_c_count = get_count(seg_c) seg_c_sum = get_sum(seg_c) seg_loud_max = hdf5_getters.get_segments_loudness_max(h5) seg_loud_max_avg = get_avg(seg_loud_max) seg_loud_max_max = get_max(seg_loud_max) seg_loud_max_min = get_min(seg_loud_max) seg_loud_max_stddev = get_stddev(seg_loud_max) seg_loud_max_count = get_count(seg_loud_max) seg_loud_max_sum = get_sum(seg_loud_max) seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5) seg_loud_max_time_avg = get_avg(seg_loud_max_time) seg_loud_max_time_max = get_max(seg_loud_max_time) seg_loud_max_time_min = get_min(seg_loud_max_time) seg_loud_max_time_stddev = get_stddev(seg_loud_max_time) seg_loud_max_time_count = get_count(seg_loud_max_time) seg_loud_max_time_sum = get_sum(seg_loud_max_time) seg_loud_start = hdf5_getters.get_segments_loudness_start(h5) seg_loud_start_avg = get_avg(seg_loud_start) seg_loud_start_max = get_max(seg_loud_start) seg_loud_start_min = get_min(seg_loud_start) seg_loud_start_stddev = get_stddev(seg_loud_start) seg_loud_start_count = get_count(seg_loud_start) seg_loud_start_sum = get_sum(seg_loud_start) seg_pitch = hdf5_getters.get_segments_pitches(h5) pitch_size = len(seg_pitch) seg_start = hdf5_getters.get_segments_start(h5) seg_start_avg = get_avg(seg_start) seg_start_max = get_max(seg_start) seg_start_min = get_min(seg_start) seg_start_stddev = get_stddev(seg_start) seg_start_count = get_count(seg_start) seg_start_sum = get_sum(seg_start) seg_timbre = hdf5_getters.get_segments_timbre(h5) tatms_c = hdf5_getters.get_tatums_confidence(h5) tatms_c_avg = get_avg(tatms_c) tatms_c_max = get_max(tatms_c) tatms_c_min = get_min(tatms_c) tatms_c_stddev = get_stddev(tatms_c) tatms_c_count = get_count(tatms_c) tatms_c_sum = get_sum(tatms_c) tatms_start = hdf5_getters.get_tatums_start(h5) tatms_start_avg = get_avg(tatms_start) tatms_start_max = get_max(tatms_start) tatms_start_min = get_min(tatms_start) tatms_start_stddev = get_stddev(tatms_start) tatms_start_count = get_count(tatms_start) tatms_start_sum = get_sum(tatms_start) #Getting the genres genre_set = 0 #flag to see if the genre has been set or not art_trm = hdf5_getters.get_artist_terms(h5) trm_freq = hdf5_getters.get_artist_terms_freq(h5) trn_wght = hdf5_getters.get_artist_terms_weight(h5) a_mb_tags = hdf5_getters.get_artist_mbtags(h5) genre_indexes = get_genre_indexes( trm_freq) #index of the highest freq final_genre = [] genres_so_far = [] for i in range(len(genre_indexes)): genre_tmp = get_genre( art_trm, genre_indexes[i] ) #genre that corresponds to the highest freq genres_so_far = genre_dict.get_genre_in_dict( genre_tmp) #getting the genre from the dictionary if len(genres_so_far) != 0: for i in genres_so_far: final_genre.append(i) genre_set = 1 #genre was found in dictionary if genre_set == 1: col_num = [] for genre in final_genre: column = int( genre) #getting the column number of the genre col_num.append(column) genre_array = genre_columns(col_num) #genre array else: genre_array = genre_columns( -1) #the genre was not found in the dictionary transpose_pitch = seg_pitch.transpose( ) #this is to tranpose the matrix,so we can have 12 rows #arrays containing the aggregate values of the 12 rows seg_pitch_avg = [] seg_pitch_max = [] seg_pitch_min = [] seg_pitch_stddev = [] seg_pitch_count = [] seg_pitch_sum = [] i = 0 #Getting the aggregate values in the pitches array for row in transpose_pitch: seg_pitch_avg.append(get_avg(row)) seg_pitch_max.append(get_max(row)) seg_pitch_min.append(get_min(row)) seg_pitch_stddev.append(get_stddev(row)) seg_pitch_count.append(get_count(row)) seg_pitch_sum.append(get_sum(row)) i = i + 1 #extracting information from the timbre array transpose_timbre = seg_pitch.transpose( ) #tranposing matrix, to have 12 rows #arrays containing the aggregate values of the 12 rows seg_timbre_avg = [] seg_timbre_max = [] seg_timbre_min = [] seg_timbre_stddev = [] seg_timbre_count = [] seg_timbre_sum = [] i = 0 for row in transpose_timbre: seg_timbre_avg.append(get_avg(row)) seg_timbre_max.append(get_max(row)) seg_timbre_min.append(get_min(row)) seg_timbre_stddev.append(get_stddev(row)) seg_timbre_count.append(get_count(row)) seg_timbre_sum.append(get_sum(row)) i = i + 1 #Writing to the flat file writer.writerow([ title, album, artist_name, year, duration, seg_start_count, tempo ]) h5.close() count = count + 1 print count
f2 = track_id[3] f3 = track_id[4] hdf5path = "MillionSongSubset/data/"+f1+"/"+f2+"/"+f3+"/"+track_id+".h5" total_tracks += 1 # Check existence of the HDF5 file if not os.path.isfile(hdf5path): continue #print('ERROR: file',hdf5path,'does not exist.') #sys.exit(0) h5 = hdf5_getters.open_h5_file_read(hdf5path) # Retrieve features from HDF5 danceability = hdf5_getters.get_danceability(h5) duration = hdf5_getters.get_duration(h5) time_of_fade_in = hdf5_getters.get_end_of_fade_in(h5) energy = hdf5_getters.get_energy(h5) key = hdf5_getters.get_key(h5) key_confidence = hdf5_getters.get_key_confidence(h5) loudness = hdf5_getters.get_loudness(h5) mode = hdf5_getters.get_mode(h5) mode_confidence = hdf5_getters.get_mode_confidence(h5) sections_start = hdf5_getters.get_sections_start(h5) num_sections = len(sections_start) if num_sections == 0: h5.close() continue segments_loudness_max = hdf5_getters.get_segments_loudness_max(h5) segments_loudness_start = hdf5_getters.get_segments_loudness_start(h5)
# Get artist location artist_location = hdf5_getters.get_artist_location(h5) artist_loc = artist_location.translate(None, string.punctuation) # Get release release_song = hdf5_getters.get_release(h5) release = release_song.translate(None, string.punctuation) # Get artist HOTTTNESSSSSS hotttness = hdf5_getters.get_artist_hotttnesss(h5) # Get artist familiarity familiarity = hdf5_getters.get_artist_familiarity(h5) # Get danceability danceability = hdf5_getters.get_danceability(h5) # Get duration duration = hdf5_getters.get_duration(h5) # Get energy #*****useless... column is filled with 0's? energy = hdf5_getters.get_energy(h5) # Get loudness loudness = hdf5_getters.get_loudness(h5) # Get year year = hdf5_getters.get_year(h5) # Get tempo
def fill_attributes(song, songH5File): #----------------------------non array attributes------------------------------- song.analysisSampleRate = str( hdf5_getters.get_analysis_sample_rate(songH5File)) song.artistDigitalID = str(hdf5_getters.get_artist_7digitalid(songH5File)) song.artistFamiliarity = str( hdf5_getters.get_artist_familiarity(songH5File)) song.artistHotness = str(hdf5_getters.get_artist_hottness(songH5File)) song.artistID = str(hdf5_getters.get_artist_id(songH5File)) song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File)) song.artistLocation = str(hdf5_getters.get_artist_location(songH5File)) song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File)) song.artistmbID = str(hdf5_getters.get_artist_mbid(songH5File)) song.artistName = str(hdf5_getters.get_artist_name(songH5File)) song.artistPlayMeID = str(hdf5_getters.get_artist_playmeid(songH5File)) song.audioMD5 = str(hdf5_getters.get_audio_md5(songH5File)) song.danceability = str(hdf5_getters.get_danceability(songH5File)) song.duration = str(hdf5_getters.get_duration(songH5File)) song.endOfFadeIn = str(hdf5_getters.get_end_of_fade_in(songH5File)) song.energy = str(hdf5_getters.get_energy(songH5File)) song.key = str(hdf5_getters.get_key(songH5File)) song.keyConfidence = str(hdf5_getters.get_key_confidence(songH5File)) song.segementsConfidence = str( hdf5_getters.get_segments_confidence(songH5File)) song.segementsConfidence = str( hdf5_getters.get_sections_confidence(songH5File)) song.loudness = str(hdf5_getters.get_loudness(songH5File)) song.mode = str(hdf5_getters.get_mode(songH5File)) song.modeConfidence = str(hdf5_getters.get_mode_confidence(songH5File)) song.release = str(hdf5_getters.get_release(songH5File)) song.releaseDigitalID = str( hdf5_getters.get_release_7digitalid(songH5File)) song.songHotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File)) song.startOfFadeOut = str(hdf5_getters.get_start_of_fade_out(songH5File)) song.tempo = str(hdf5_getters.get_tempo(songH5File)) song.timeSignature = str(hdf5_getters.get_time_signature(songH5File)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence(songH5File)) song.title = str(hdf5_getters.get_title(songH5File)) song.trackID = str(hdf5_getters.get_track_id(songH5File)) song.trackDigitalID = str(hdf5_getters.get_track_7digitalid(songH5File)) song.year = str(hdf5_getters.get_year(songH5File)) #-------------------------------array attributes-------------------------------------- #array float song.beatsStart_mean, song.beatsStart_var = convert_array_to_meanvar( hdf5_getters.get_beats_start(songH5File)) #array float song.artistTermsFreq_mean, song.artistTermsFreq_var = convert_array_to_meanvar( hdf5_getters.get_artist_terms_freq(songH5File)) #array float song.artistTermsWeight_mean, song.artistTermsWeight_var = convert_array_to_meanvar( hdf5_getters.get_artist_terms_weight(songH5File)) #array int song.artistmbTagsCount_mean, song.artistmbTagsCount_var = convert_array_to_meanvar( hdf5_getters.get_artist_mbtags_count(songH5File)) #array float song.barsConfidence_mean, song.barsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_bars_confidence(songH5File)) #array float song.barsStart_mean, song.barsStart_var = convert_array_to_meanvar( hdf5_getters.get_bars_start(songH5File)) #array float song.beatsConfidence_mean, song.beatsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_beats_confidence(songH5File)) #array float song.sectionsConfidence_mean, song.sectionsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_sections_confidence(songH5File)) #array float song.sectionsStart_mean, song.sectionsStart_var = convert_array_to_meanvar( hdf5_getters.get_sections_start(songH5File)) #array float song.segmentsConfidence_mean, song.segmentsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_segments_confidence(songH5File)) #array float song.segmentsLoudness_mean, song.segmentsLoudness_var = convert_array_to_meanvar( hdf5_getters.get_segments_loudness_max(songH5File)) #array float song.segmentsLoudnessMaxTime_mean, song.segmentsLoudnessMaxTime_var = convert_array_to_meanvar( hdf5_getters.get_segments_loudness_max_time(songH5File)) #array float song.segmentsLoudnessMaxStart_mean, song.segmentsLoudnessMaxStart_var = convert_array_to_meanvar( hdf5_getters.get_segments_loudness_start(songH5File)) #array float song.segmentsStart_mean, song.segmentsStart_var = convert_array_to_meanvar( hdf5_getters.get_segments_start(songH5File)) #array float song.tatumsConfidence_mean, song.tatumsConfidence_var = convert_array_to_meanvar( hdf5_getters.get_tatums_confidence(songH5File)) #array float song.tatumsStart_mean, song.tatumsStart_var = convert_array_to_meanvar( hdf5_getters.get_tatums_start(songH5File)) #array2d float song.segmentsTimbre_mean, song.segmentsTimbre_var = covert_2darray_to_meanvar( hdf5_getters.get_segments_timbre(songH5File)) #array2d float song.segmentsPitches_mean, song.segmentsPitches_var = covert_2darray_to_meanvar( hdf5_getters.get_segments_pitches(songH5File)) #------------------------array string attributes------------------------ song.similarArtists = convert_array_to_string( hdf5_getters.get_similar_artists(songH5File)) #array string song.artistTerms = convert_array_to_string( hdf5_getters.get_artist_terms(songH5File)) #array string song.artistmbTags = convert_array_to_string( hdf5_getters.get_artist_mbtags(songH5File)) #array string return song
def main(): outputFile1 = open('SongCSV.csv', 'w') csvRowString = "" ################################################# #if you want to prompt the user for the order of attributes in the csv, #leave the prompt boolean set to True #else, set 'prompt' to False and set the order of attributes in the 'else' #clause prompt = False ################################################# if prompt == True: while prompt: prompt = False csvAttributeString = raw_input( "\n\nIn what order would you like the colums of the CSV file?\n" + "Please delineate with commas. The options are: " + "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude," + " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," + " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" + "For example, you may write \"Title, Tempo, Duration\"...\n\n" + "...or exit by typing 'exit'.\n\n") csvAttributeList = re.split('\W+', csvAttributeString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += 'AlbumID' elif attribute == 'AlbumName'.lower(): csvRowString += 'AlbumName' elif attribute == 'ArtistID'.lower(): csvRowString += 'ArtistID' elif attribute == 'ArtistLatitude'.lower(): csvRowString += 'ArtistLatitude' elif attribute == 'ArtistLocation'.lower(): csvRowString += 'ArtistLocation' elif attribute == 'ArtistLongitude'.lower(): csvRowString += 'ArtistLongitude' elif attribute == 'ArtistName'.lower(): csvRowString += 'ArtistName' elif attribute == 'Danceability'.lower(): csvRowString += 'Danceability' elif attribute == 'Duration'.lower(): csvRowString += 'Duration' elif attribute == 'KeySignature'.lower(): csvRowString += 'KeySignature' elif attribute == 'KeySignatureConfidence'.lower(): csvRowString += 'KeySignatureConfidence' elif attribute == 'SongID'.lower(): csvRowString += "SongID" elif attribute == 'Tempo'.lower(): csvRowString += 'Tempo' elif attribute == 'TimeSignature'.lower(): csvRowString += 'TimeSignature' elif attribute == 'TimeSignatureConfidence'.lower(): csvRowString += 'TimeSignatureConfidence' elif attribute == 'Title'.lower(): csvRowString += 'Title' elif attribute == 'Year'.lower(): csvRowString += 'Year' elif attribute == 'Exit'.lower(): sys.exit() else: prompt = True print "==============" print "I believe there has been an error with the input." print "==============" break csvRowString += "," lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" #else, if you want to hard code the order of the csv file and not prompt #the user, else: ################################################# #change the order of the csv file here #Default is to list all available attributes (in alphabetical order) csvRowString = ( "SongID,AlbumID,AlbumName,TrackId,ArtistID,ArtistLatitude,ArtistLocation," + "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature," + "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence," + "Title,Year") ################################################# csvAttributeList = re.split('\W+', csvRowString) for i, v in enumerate(csvAttributeList): csvAttributeList[i] = csvAttributeList[i].lower() outputFile1.write("SongNumber,") outputFile1.write(csvRowString + "\n") csvRowString = "" ################################################# #Set the basedir here, the root directory from which the search #for files stored in a (hierarchical data structure) will originate basedir = "/home/umwangye/millonsong/MillionSongSubset/data/" # "." As the default means the current directory ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files. ################################################# #FOR LOOP for root, dirs, files in os.walk(basedir): files = glob.glob(os.path.join(root, '*' + ext)) for f in files: print f songH5File = hdf5_getters.open_h5_file_read(f) #song = Song(str(hdf5_getters.get_song_id(songH5File))) #testDanceability = hdf5_getters.get_danceability(songH5File) # print type(testDanceability) # print ("Here is the danceability: ") + str(testDanceability) numPerH5 = hdf5_getters.get_num_songs(songH5File) for cnt in range(numPerH5): song = Song(str(hdf5_getters.get_song_id(songH5File, cnt))) song.trackId = str(hdf5_getters.get_track_id(songH5File, cnt)) song.artistID = str(hdf5_getters.get_artist_id( songH5File, cnt)) song.albumID = str( hdf5_getters.get_release_7digitalid(songH5File, cnt)) song.albumName = str(hdf5_getters.get_release(songH5File, cnt)) song.artistLatitude = str( hdf5_getters.get_artist_latitude(songH5File, cnt)) song.artistLocation = str( hdf5_getters.get_artist_location(songH5File, cnt)) song.artistLongitude = str( hdf5_getters.get_artist_longitude(songH5File, cnt)) song.artistName = str( hdf5_getters.get_artist_name(songH5File, cnt)) song.danceability = str( hdf5_getters.get_danceability(songH5File, cnt)) song.duration = str(hdf5_getters.get_duration(songH5File, cnt)) # song.setGenreList() song.keySignature = str(hdf5_getters.get_key(songH5File, cnt)) song.keySignatureConfidence = str( hdf5_getters.get_key_confidence(songH5File, cnt)) # song.lyrics = None # song.popularity = None song.tempo = str(hdf5_getters.get_tempo(songH5File, cnt)) song.timeSignature = str( hdf5_getters.get_time_signature(songH5File, cnt)) song.timeSignatureConfidence = str( hdf5_getters.get_time_signature_confidence( songH5File, cnt)) song.title = str(hdf5_getters.get_title(songH5File, cnt)) song.year = str(hdf5_getters.get_year(songH5File, cnt)) #print song count csvRowString += str(song.songCount) + "," for attribute in csvAttributeList: # print "Here is the attribute: " + attribute + " \n" if attribute == 'AlbumID'.lower(): csvRowString += song.albumID elif attribute == 'AlbumName'.lower(): albumName = song.albumName albumName = albumName.replace(',', "") csvRowString += "\"" + albumName + "\"" elif attribute == 'TrackId'.lower(): csvRowString += song.trackId elif attribute == 'ArtistID'.lower(): csvRowString += "\"" + song.artistID + "\"" elif attribute == 'ArtistLatitude'.lower(): latitude = song.artistLatitude if latitude == 'nan': latitude = '' csvRowString += latitude elif attribute == 'ArtistLocation'.lower(): location = song.artistLocation location = location.replace(',', '') csvRowString += "\"" + location + "\"" elif attribute == 'ArtistLongitude'.lower(): longitude = song.artistLongitude if longitude == 'nan': longitude = '' csvRowString += longitude elif attribute == 'ArtistName'.lower(): csvRowString += "\"" + song.artistName + "\"" elif attribute == 'Danceability'.lower(): csvRowString += song.danceability elif attribute == 'Duration'.lower(): csvRowString += song.duration elif attribute == 'KeySignature'.lower(): csvRowString += song.keySignature elif attribute == 'KeySignatureConfidence'.lower(): # print "key sig conf: " + song.timeSignatureConfidence csvRowString += song.keySignatureConfidence elif attribute == 'SongID'.lower(): csvRowString += "\"" + song.id + "\"" elif attribute == 'Tempo'.lower(): # print "Tempo: " + song.tempo csvRowString += song.tempo elif attribute == 'TimeSignature'.lower(): csvRowString += song.timeSignature elif attribute == 'TimeSignatureConfidence'.lower(): # print "time sig conf: " + song.timeSignatureConfidence csvRowString += song.timeSignatureConfidence elif attribute == 'Title'.lower(): csvRowString += "\"" + song.title + "\"" elif attribute == 'Year'.lower(): csvRowString += song.year else: csvRowString += "Erm. This didn't work. Error. :( :(\n" csvRowString += "," #Remove the final comma from each row in the csv lastIndex = len(csvRowString) csvRowString = csvRowString[0:lastIndex - 1] csvRowString += "\n" outputFile1.write(csvRowString) csvRowString = "" songH5File.close() outputFile1.close()
def getInfo(files): data = [] build_str = '' with open(sys.argv[1], 'r') as f: contents = f.read() c = contents.split() f.close() print("creating csv with following fields:" + contents) for i in c: build_str = build_str + i + ',' build_str = build_str[:-1] build_str = build_str + '\n' for fil in files: curFile = getters.open_h5_file_read(fil) d2 = {} get_table = {'track_id': getters.get_track_id(curFile), 'segments_pitches': getters.get_segments_pitches(curFile), 'time_signature_confidence': getters.get_time_signature_confidence(curFile), 'song_hotttnesss': getters.get_song_hotttnesss(curFile), 'artist_longitude': getters.get_artist_longitude(curFile), 'tatums_confidence': getters.get_tatums_confidence(curFile), 'num_songs': getters.get_num_songs(curFile), 'duration': getters.get_duration(curFile), 'start_of_fade_out': getters.get_start_of_fade_out(curFile), 'artist_name': getters.get_artist_name(curFile), 'similar_artists': getters.get_similar_artists(curFile), 'artist_mbtags': getters.get_artist_mbtags(curFile), 'artist_terms_freq': getters.get_artist_terms_freq(curFile), 'release': getters.get_release(curFile), 'song_id': getters.get_song_id(curFile), 'track_7digitalid': getters.get_track_7digitalid(curFile), 'title': getters.get_title(curFile), 'artist_latitude': getters.get_artist_latitude(curFile), 'energy': getters.get_energy(curFile), 'key': getters.get_key(curFile), 'release_7digitalid': getters.get_release_7digitalid(curFile), 'artist_mbid': getters.get_artist_mbid(curFile), 'segments_confidence': getters.get_segments_confidence(curFile), 'artist_hotttnesss': getters.get_artist_hotttnesss(curFile), 'time_signature': getters.get_time_signature(curFile), 'segments_loudness_max_time': getters.get_segments_loudness_max_time(curFile), 'mode': getters.get_mode(curFile), 'segments_loudness_start': getters.get_segments_loudness_start(curFile), 'tempo': getters.get_tempo(curFile), 'key_confidence': getters.get_key_confidence(curFile), 'analysis_sample_rate': getters.get_analysis_sample_rate(curFile), 'bars_confidence': getters.get_bars_confidence(curFile), 'artist_playmeid': getters.get_artist_playmeid(curFile), 'artist_terms_weight': getters.get_artist_terms_weight(curFile), 'segments_start': getters.get_segments_start(curFile), 'artist_location': getters.get_artist_location(curFile), 'loudness': getters.get_loudness(curFile), 'year': getters.get_year(curFile), 'artist_7digitalid': getters.get_artist_7digitalid(curFile), 'audio_md5': getters.get_audio_md5(curFile), 'segments_timbre': getters.get_segments_timbre(curFile), 'mode_confidence': getters.get_mode_confidence(curFile), 'end_of_fade_in': getters.get_end_of_fade_in(curFile), 'danceability': getters.get_danceability(curFile), 'artist_familiarity': getters.get_artist_familiarity(curFile), 'artist_mbtags_count': getters.get_artist_mbtags_count(curFile), 'tatums_start': getters.get_tatums_start(curFile), 'artist_id': getters.get_artist_id(curFile), 'segments_loudness_max': getters.get_segments_loudness_max(curFile), 'bars_start': getters.get_bars_start(curFile), 'beats_start': getters.get_beats_start(curFile), 'artist_terms': getters.get_artist_terms(curFile), 'sections_start': getters.get_sections_start(curFile), 'beats_confidence': getters.get_beats_confidence(curFile), 'sections_confidence': getters.get_sections_confidence(curFile)} tid = fil.split('/')[-1].split('.')[0] # print(c) for i in c: if i in get_table: d2[i] = get_table[i] d2[i] = str(d2[i]).replace('\n','') build_str = build_str + d2[i] + ',' else: print('error: unspecified field') exit(0) build_str = build_str[:-1] # print(build_str[:-1]) build_str = build_str + '\n' curFile.close() build_str = build_str.replace('b','').replace("'",'').replace('"','') return (build_str)