def get_attribute(files):
    """Collect a fixed set of song attributes from each HDF5 song file.

    Every path in ``files`` is printed, opened with ``hdf5_getters`` and
    queried with the getters listed below (each called with its default
    song index). The per-file row is passed through ``np.nan_to_num``.

    Args:
        files: iterable of paths to HDF5 song files.

    Returns:
        list: one ``np.nan_to_num`` result per input file, in input order.
    """
    getter_names = (
        'get_num_songs',
        'get_artist_familiarity',
        'get_artist_hotttnesss',
        'get_danceability',
        'get_energy',
        'get_key',
        'get_key_confidence',
        'get_loudness',
        'get_mode',
        'get_mode_confidence',
        'get_tempo',
        'get_time_signature',
        'get_time_signature_confidence',
        'get_title',
        'get_artist_name',
    )
    array = []
    for f in files:
        print(f)
        h5 = hdf5_getters.open_h5_file_read(f)
        try:
            temp = [getattr(hdf5_getters, name)(h5) for name in getter_names]
        finally:
            # Always release the handle, even when a getter raises.
            h5.close()
        # NOTE(review): title and artist name are not numeric, so the row is
        # presumably not a float array and NaNs may survive here -- verify.
        array.append(np.nan_to_num(temp))
    return array
def parse_songs(directory):
    """Recursively convert HDF5 song files under ``directory`` to JSON.

    Sub-directories are visited recursively. For every regular file the
    module-level ``count`` is incremented, and parsing stops once it has
    reached ``MAX_SONGS``. Each song's title, year, danceability, artist
    tags and tempo are written to
    ``/home/ubuntu/million_songs/parsed_data/<file stem>.json``.

    Args:
        directory: path of the directory to scan.
    """
    global count
    for filename in os.listdir(directory):
        if count >= MAX_SONGS:
            return
        file_path = os.path.join(directory, filename)
        if os.path.isdir(file_path):
            parse_songs(file_path)
            continue

        count += 1
        if count % 100 == 0:
            print('Parsed ' + str(count) + ' songs')

        # Keep the file stem in its own variable: the original reused the
        # name ``song`` for it, so the JSON contained the (stem, extension)
        # tuple instead of the song data.
        stem = os.path.splitext(filename)[0]
        json_path = "/home/ubuntu/million_songs/parsed_data/" + stem + '.json'

        with hdf5_getters.open_h5_file_read(file_path) as h5:
            for i in range(hdf5_getters.get_num_songs(h5)):
                tags = hdf5_getters.get_artist_mbtags(h5, i).tolist()
                song = {
                    'title': hdf5_getters.get_title(h5, i).decode('UTF-8'),
                    'year': hdf5_getters.get_year(h5, i).item(),
                    'danceability': hdf5_getters.get_danceability(h5, i).item(),
                    'genres': [tag.decode('UTF-8') for tag in tags],
                    'tempo': hdf5_getters.get_tempo(h5, i).item(),
                }
                # One output file per input file: with several songs in one
                # HDF5 file the last song wins.
                with open(json_path, 'w') as fp:
                    json.dump(song, fp)
def load(self, msd_summary_file):
    """Open an MSD summary file and record how many songs it contains.

    Stores the path on ``self.msd_summary_file``, runs ``self.check()``,
    keeps the opened handle on ``self.h5_fd`` and the song count (as a
    plain ``int``) on ``self.num_songs``, then logs the count at debug
    level.

    Args:
        msd_summary_file: path of the summary HDF5 file to open.
    """
    self.msd_summary_file = msd_summary_file
    self.check()

    summary_fd = hdf5_getters.open_h5_file_read(self.msd_summary_file)
    self.h5_fd = summary_fd

    song_total = int(hdf5_getters.get_num_songs(summary_fd))
    self.num_songs = song_total

    message = "Found {} songs in summary file".format(self.num_songs)
    logger.debug(message)
def traverseAndWrite(root, genreDirs, genreKeys):
    """Walk ``root`` and write one JSON record per (song, matching genre).

    Directories are traversed recursively. For each HDF5 file every song
    is read; songs rejected by ``notValidSong`` are skipped. For every
    genre in ``genreKeys`` that ``genreInTags`` finds in the artist tags, a
    record is appended to ``<genreDirs[genre]>/<artist>--<title>.json``
    through ``writeToDescriptor``.

    Args:
        root: path of a directory or of a single HDF5 file.
        genreDirs: mapping from genre to its output directory.
        genreKeys: iterable of genres to look for.
    """
    if not isfile(root):
        for f in listdir(root):
            traverseAndWrite(root + "/" + f, genreDirs, genreKeys)
        return

    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    h5 = hdf5_getters.open_h5_file_read(root)
    try:
        numOfSongs = hdf5_getters.get_num_songs(h5)
        for index in range(numOfSongs):
            tags = hdf5_getters.get_artist_mbtags(h5, index)
            artist = hdf5_getters.get_artist_name(h5, index)
            songName = hdf5_getters.get_title(h5, index)
            segmentTimbre = hdf5_getters.get_segments_timbre(h5, index)
            segmentPitches = hdf5_getters.get_segments_pitches(h5, index)

            if notValidSong(tags, artist, songName, segmentTimbre, segmentPitches):
                # Skip only this song. The original closed the file here,
                # so every following song was read from a closed handle.
                continue

            # File-name-safe copies. The stored metadata keeps the original
            # strings for every genre (the original overwrote them after
            # the first match).
            safe_artist = ''.join(c for c in artist if c in valid_chars)
            safe_song = ''.join(c for c in songName if c in valid_chars)

            for genre in genreKeys:
                if not genreInTags(genre, tags):
                    continue
                song = {
                    'genre': genre,
                    'artist_name': artist,
                    'song_title': songName,
                    'segments_pitches': segmentPitches.tolist(),
                    'segments_timbre': segmentTimbre.tolist(),
                }
                out_path = genreDirs[genre] + "/" + safe_artist + "--" + safe_song + ".json"
                fd = open(out_path, 'a')
                try:
                    writeToDescriptor(fd, song)
                finally:
                    fd.close()
    finally:
        h5.close()
def csv_convert(basedir, csv_filename):
    '''Convert every song of the .h5 files under basedir to one CSV file.

    The module-level ``header`` is written first, then one line per song
    accepted by ``validate_song`` (formatted by ``h5_to_csv_fields``).
    A progress message is printed after every 10th written song and the
    elapsed time is printed at the end.

    Inputs:
        basedir, a string naming the directory tree to walk
        csv_filename, name of the output file; it is created under
            /home/ec2-user/
    '''
    t1 = time.time()
    cnt = 0
    with open("/home/ec2-user/{}".format(csv_filename), "w") as csv_file:
        csv_file.write(header)
        for root, dirs, files in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*.h5')):
                h5 = gt.open_h5_file_read(f)
                try:
                    # each h5 file can hold multiple songs
                    num_songs = gt.get_num_songs(h5)
                    for j in range(int(num_songs)):
                        if not validate_song(h5, j):
                            continue
                        cnt += 1
                        csv_file.write(h5_to_csv_fields(h5, j))
                        # Report only when the counter actually advanced,
                        # so rejected songs do not repeat the message.
                        if cnt % 10 == 0:
                            print("{} files csved thus far".format(cnt))
                finally:
                    # Close even on error; leaked handles exhaust resources
                    # over a large tree.
                    h5.close()
    t2 = time.time()
    print('directory {} csv-ed in:'.format(basedir), strtimedelta(t1, t2))
def get(getters, h5file):
    """Read the requested fields of the first song in an HDF5 song file.

    For each name in ``getters`` the function ``hdf5_getters.get_<name>``
    is called for song index 0. Scalar results are stored in a dict keyed
    by the getter name; array results are ignored with a message.

    The process exits (status 0) when the file does not exist or contains
    no song.

    Args:
        getters: iterable of getter names without the ``get_`` prefix.
        h5file: path of the HDF5 song file.
    """
    # sanity check
    if not os.path.isfile(h5file):
        print('ERROR: file %s does not exist.' % h5file)
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(h5file)
    numSongs = hdf5_getters.get_num_songs(h5)
    songidx = 0
    if songidx >= numSongs:
        print('ERROR: file contains only %s' % numSongs)
        h5.close()
        sys.exit(0)

    line = dict()
    for getter in getters:
        try:
            res = getattr(hdf5_getters, 'get_' + getter)(h5, songidx)
        except AttributeError as e:
            print(e)
            # No value for this getter: skip it instead of falling through
            # with an undefined (or stale, from the previous getter) result.
            continue
        if res.__class__.__name__ == 'ndarray':
            # Multidimensional values are not stored; a summary such as the
            # mean could be stored instead.
            print('Ignoring....')
        else:
            line[getter] = res
    # NOTE(review): as far as this block shows, ``line`` is neither returned
    # nor is ``h5`` closed -- confirm against the full file.
def h5file_data(h5file):
    """Yield ``get_h5_data(h5, i)`` for every song in one HDF5 file.

    The module-level ``lock`` is acquired before the file is opened and is
    released when the generator finishes or is closed.

    Args:
        h5file: path of the HDF5 song file.

    Yields:
        Whatever ``get_h5_data`` returns for each song index, in order.
    """
    with lock:
        h5 = hdf5_getters.open_h5_file_read(h5file)
        try:
            num_songs = hdf5_getters.get_num_songs(h5)
            for i in range(num_songs):
                yield get_h5_data(h5, i)
        finally:
            # Also runs when the consumer stops early or an error occurs,
            # so the handle cannot leak.
            h5.close()
def add_to_database(con, full_path):
    """Insert the songs of one HDF5 file into the database.

    For every song the artist is looked up by MusicBrainz id with
    ``get_artist_id``. When the lookup returns -1 the artist and its
    relations are inserted first. The song itself is then inserted with
    the resulting artist id.

    Args:
        con: database connection handed through to the table helpers.
        full_path: path of the HDF5 file to import.
    """
    data = h5.open_h5_file_read(full_path)
    try:
        number_of_songs = h5.get_num_songs(data)
        for song_index in xrange(0, number_of_songs):
            artist_id = get_artist_id(con, h5.get_artist_mbid(data, song_index))
            if artist_id == -1:
                artist_id = add_data_to_artists_table(con, data, song_index)
                add_data_to_artists_rel_table(con, data, song_index, artist_id)
            # NOTE(review): song_index is not passed on here; confirm that
            # add_data_to_songs_table is meant to work without it.
            add_data_to_songs_table(con, data, artist_id)
    finally:
        # Release the file even when an insert fails.
        data.close()
def h5_to_df(hdf5_file):
    """Build a DataFrame with one row per song of an open HDF5 file.

    The columns are the keys of the module-level ``song_funs`` mapping.
    Each cell is the result of calling the mapped function with the file
    handle and the song index.

    Args:
        hdf5_file: open HDF5 handle understood by ``hdf5_getters``.

    Returns:
        pandas.DataFrame: the collected rows.
    """
    cols = song_funs.keys()
    song_total = hdf5_getters.get_num_songs(hdf5_file)
    table = [
        [song_funs[name](hdf5_file, song_idx) for name in cols]
        for song_idx in xrange(song_total)
    ]
    return pd.DataFrame(table, columns=cols)
def create_idix(h5, msd_path):
    '''
    Create a SongID -> index lookup for the million songs.

    Songs are accessed by index (0 to number of songs - 1) and not by
    SongID, so the mapping is built once and pickled to "songidix.txt" in
    the current working directory.

    h5 is the open HDF5 handle to index. msd_path is accepted but not used
    by this function.
    '''
    import hdf5_getters
    totsng = hdf5_getters.get_num_songs(h5)
    idixdic = {
        hdf5_getters.get_song_id(h5, count): count
        for count in range(0, totsng)
    }
    # ``with`` closes the file even if pickling fails part-way.
    with open("songidix.txt", "wb") as idixfile:
        pickle.dump(idixdic, idixfile)
def traverseAndWrite(root, genreKeys, counts):
    """Count, per genre, the songs under ``root`` whose artist tags match.

    Directories are descended recursively. For a file, each song's artist
    tags are tested against every genre with ``genreInTags`` and the
    matching entries of ``counts`` are incremented in place. The running
    totals are printed once per file.

    Args:
        root: path of a directory or of a single HDF5 file.
        genreKeys: iterable of genres to count.
        counts: mutable mapping from genre to running total.
    """
    if isfile(root):
        h5 = hdf5_getters.open_h5_file_read(root)
        song_total = hdf5_getters.get_num_songs(h5)
        for song_idx in range(song_total):
            tags = hdf5_getters.get_artist_mbtags(h5, song_idx)
            for genre in genreKeys:
                if not genreInTags(genre, tags):
                    continue
                counts[genre] += 1
        print(counts)
        h5.close()
        return

    for entry in listdir(root):
        traverseAndWrite(root + "/" + entry, genreKeys, counts)
def main():
    """Print the dictionary of every song in the Million Song Subset.

    The data directory is expected to be laid out three levels deep
    (A/A/A/<file>). Each file's path is printed, followed by the result of
    ``build_h5_dictionary`` for each of its songs.
    """
    path = '/Users/maxenchung/Desktop/untitled/MillionSongSubset/data/'
    for directory1 in os.listdir(path):  # A
        level1 = path + directory1
        for directory2 in os.listdir(level1):  # A/A
            level2 = level1 + '/' + directory2
            for directory3 in os.listdir(level2):  # A/A/A
                file_path = level2 + '/' + directory3 + '/'
                for filename in os.listdir(file_path):
                    h5_file = h5g.open_h5_file_read(file_path + filename)
                    try:
                        num_songs = h5g.get_num_songs(h5_file)
                        print(file_path + filename)
                        for song_index in range(0, num_songs):
                            print(build_h5_dictionary(h5_file, song_index))
                    finally:
                        # The original never closed the files, leaking one
                        # handle per song file in the tree.
                        h5_file.close()
def get_csv_rows(h5file, fieldType='all'):
    """Render every song of an HDF5 file as quoted CSV text.

    Each song's fields (from ``get_csv_h5_data``) are joined with ``","``,
    stripped of line breaks, wrapped in double quotes and terminated with
    a newline.

    Args:
        h5file: path of the HDF5 song file.
        fieldType: passed through to ``get_csv_h5_data``.

    Returns:
        str: all rows concatenated (empty string when there are no songs).
    """
    h5 = hdf5_getters.open_h5_file_read(h5file)
    pieces = []
    song_total = hdf5_getters.get_num_songs(h5)
    for song_idx in range(song_total):
        fields = get_csv_h5_data(h5, song_idx, fieldType)
        row = "\",\"".join(fields).replace("\n", "").replace("\r", "")
        pieces.append("\"" + row + "\"")
        pieces.append("\n")
    h5.close()
    return ''.join(pieces)
def get_song_info(song_path, pickle_path):
    """Collect the scalar fields of the first song in an HDF5 file.

    A dict is filled with ``pickle_id`` (from ``get_song_id``) and, for
    every getter returned by ``get_modified_getters``, the stringified
    result keyed by the getter name without its ``get_`` prefix. Array
    results are not stored; only their shape is printed.

    The process exits (status 0) when the file does not exist or contains
    no song.

    Args:
        song_path: path of the HDF5 song file.
        pickle_path: not used in the visible part of this function.
    """
    data = {}
    data['pickle_id'] = get_song_id(song_path)

    hdf5path = song_path
    songidx = 0

    # sanity check
    if not os.path.isfile(hdf5path):
        print('ERROR: file %s does not exist.' % hdf5path)
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(hdf5path)
    numSongs = hdf5_getters.get_num_songs(h5)
    if songidx >= numSongs:
        print('ERROR: file contains only %s' % numSongs)
        h5.close()
        sys.exit(0)

    getters = get_modified_getters()
    for getter in getters:
        try:
            res = hdf5_getters.__getattribute__(getter)(h5, songidx)
        except AttributeError as e:
            # NOTE(review): ``summary`` is not defined in this block;
            # presumably a module-level flag -- confirm.
            if not summary:
                print(e)
                print('forgot -summary flag? specified wrong getter?')
            # Either way there is no value for this getter. The original
            # fell through and used an undefined or stale ``res``.
            continue
        if res.__class__.__name__ == 'ndarray':
            print('%s: shape = %s' % (getter[4:], res.shape))
        else:
            data[getter[4:]] = str(res)
    # NOTE(review): as far as this block shows, ``data`` is neither pickled
    # nor returned and ``h5`` is not closed -- confirm against the full file.
def _handle_h5_file(self, filename):
    """Write one CSV row per song of ``filename`` through ``self.writer``.

    On first use, when ``self.getters`` is empty, the getter list is
    obtained from ``self._get_getters`` and a header row is written: the
    getter names without their ``get_`` prefix. Array values are written
    as JSON lists.

    Args:
        filename: path of the HDF5 song file.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        num_songs = hdf5_getters.get_num_songs(h5)
        if not self.getters:
            self.getters = self._get_getters(h5)
            getter_row = [getter[4:] for getter in self.getters]
            self.writer.writerow(getter_row)

        for i in range(num_songs):
            result = []
            for getter in self.getters:
                value = getattr(hdf5_getters, getter)(h5, i)
                if value.__class__.__name__ == 'ndarray':
                    # Special case for ndarray types
                    value = json.dumps(value.tolist())
                result.append(value)
            self.writer.writerow(result)
    finally:
        # Close the handle even when a getter or the writer raises.
        h5.close()
def _handle_h5_file(self, filename):
    """Write one CSV row per song of ``filename`` through ``self.writer``.

    On first use, when ``self.getters`` is empty, the getter list is
    obtained from ``self._get_getters`` and a header row is written: the
    getter names without their ``get_`` prefix. Array values are written
    as JSON lists.

    Args:
        filename: path of the HDF5 song file.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        num_songs = hdf5_getters.get_num_songs(h5)
        if not self.getters:
            self.getters = self._get_getters(h5)
            getter_row = [getter[4:] for getter in self.getters]
            self.writer.writerow(getter_row)

        for i in xrange(num_songs):
            result = []
            for getter in self.getters:
                value = getattr(hdf5_getters, getter)(h5, i)
                if value.__class__.__name__ == 'ndarray':
                    # Special case for ndarray types
                    value = json.dumps(value.tolist())
                result.append(value)
            self.writer.writerow(result)
    finally:
        # Close the handle even when a getter or the writer raises.
        h5.close()
def _format_msd_rows(items):
    """Render song dicts as tab-separated lines, one line per song."""
    return "".join(
        "%s\t%s\t%s\t%s\t%s\t%s\n" % (
            s["msid"],
            s["artist_name"],
            s["energy"],
            s["loudness"],
            s["tempo"],
            s["year"])
        for s in items)


def parse_file(h5):
    """Append songs 990000 onwards of ``h5`` to "out/msd_orig.txt".

    For each song the track id, artist name, energy, loudness, tempo and
    year are written as one tab-separated line. Rows are buffered and
    flushed whenever the song index is a multiple of 10000 (after the
    first), and once more at the end. A warning is printed for artist
    names containing a tab, since they would break the format.

    Args:
        h5: open HDF5 handle understood by ``hdf5_getters`` / ``H5``.
    """
    data = []
    num_songs = hdf5_getters.get_num_songs(h5)
    # ``with`` closes the output even if a getter raises mid-run.
    with open("out/msd_orig.txt", "a") as out:
        for i in range(990000, num_songs):
            item = {
                "msid": H5.get_track_id(h5, i),
                "artist_name": H5.get_artist_name(h5, i),
                "energy": H5.get_energy(h5, i),
                "loudness": H5.get_loudness(h5, i),
                "tempo": H5.get_tempo(h5, i),
                "year": H5.get_year(h5, i),
            }
            data.append(item)

            if "\t" in item["artist_name"]:
                print("!Warning! Tab found in artist name: %s" % item["artist_name"])

            if i > 990000 and i % 10000 == 0:
                print("Writing %d to %d." % (i - 10000, i))
                out.write(_format_msd_rows(data))
                data = []

        # Flush whatever is left after the last full batch.
        out.write(_format_msd_rows(data))
def get_attribute(f):
    """Append one row of song attributes from file ``f`` to ``array``.

    The module-level ``count`` is incremented, the path is printed, and
    the getters listed below are called with their default song index.
    The row is passed through ``np.nan_to_num`` and appended to the
    module-level ``array``.

    Args:
        f: path of the HDF5 song file.
    """
    # Without this declaration ``count += 1`` makes ``count`` a local and
    # raises UnboundLocalError on the first call.
    global count
    count += 1
    print(f)

    getter_names = (
        'get_num_songs',
        'get_artist_familiarity',
        'get_artist_hotttnesss',
        'get_danceability',
        'get_energy',
        'get_key',
        'get_key_confidence',
        'get_loudness',
        'get_mode',
        'get_mode_confidence',
        'get_tempo',
        'get_time_signature',
        'get_time_signature_confidence',
    )
    h5 = hdf5_getters.open_h5_file_read(f)
    try:
        temp = [getattr(hdf5_getters, name)(h5) for name in getter_names]
    finally:
        # Release the handle even when a getter raises.
        h5.close()
    array.append(np.nan_to_num(temp))
def main():
    """Export the cleaned songs of the Million Song Subset to output.csv.

    The data directory is expected to be laid out three levels deep
    (A/A/A/<file>). Every song is turned into a dictionary by
    ``build_h5_dictionary`` and cleaned by ``clean_h5_dictionary``; empty
    results are dropped. The CSV starts with the sorted keys of the last
    kept song's dictionary, followed by one row per kept song.
    """
    path = '/Users/maxenchung/Desktop/untitled/MillionSongSubset/data/'
    writeList = list()
    header_source = None  # dictionary of the last song that produced a row

    for directory1 in os.listdir(path):  # A
        level1 = path + directory1
        for directory2 in os.listdir(level1):  # A/A
            level2 = level1 + '/' + directory2
            for directory3 in os.listdir(level2):  # A/A/A
                file_path = level2 + '/' + directory3 + '/'
                for filename in os.listdir(file_path):
                    with h5g.open_h5_file_read(file_path + filename) as h5_file:
                        num_songs = h5g.get_num_songs(h5_file)
                        print(file_path + filename)
                        for song_index in range(0, num_songs):
                            my_dict = build_h5_dictionary(h5_file, song_index)
                            newList = clean_h5_dictionary(my_dict)
                            # Only non-empty cleaned rows go into the CSV.
                            if not newList:
                                continue
                            writeList.append(newList)
                            header_source = my_dict

    print(writeList)

    # Header and rows are written in one pass. With no usable song the
    # file is simply left empty (the original raised NameError on the
    # undefined header).
    with open('output.csv', 'w') as csvfile:
        mywriter = csv.writer(csvfile)
        if header_source is not None:
            mywriter.writerow(sorted(header_source.keys()))
        mywriter.writerows(writeList)
sys.path.append(os.path.join(msd_code_path,'PythonSrc')) # imports specific to the MSD import hdf5_getters as GETTERS cnt = 0 loops = 0 for alpha in string.ascii_uppercase : for root, dirs, files in os.walk('/mnt/million-songs/data/'+alpha): files = glob.glob(os.path.join(root,'*'+'.h5')) for f in files : h5 = GETTERS.open_h5_file_read(f) num_songs = GETTERS.get_num_songs(h5) print f, num_songs for i in range(num_songs): analysis_sample_rate = GETTERS.get_analysis_sample_rate(h5, i) artist_7digitalid = GETTERS.get_artist_7digitalid(h5, i) artist_familiarity = GETTERS.get_artist_familiarity(h5, i) artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5, i) artist_id = GETTERS.get_artist_id(h5, i) artist_latitude = GETTERS.get_artist_latitude(h5, i) artist_location = GETTERS.get_artist_location(h5, i) artist_longitude = GETTERS.get_artist_longitude(h5, i) artist_mbid = GETTERS.get_artist_mbid(h5, i) artist_mbtags = ','.join(str(e) for e in GETTERS.get_artist_mbtags(h5, i)) # array artist_mbtags_count = ','.join(str(e) for e in GETTERS.get_artist_mbtags_count(h5, i)) # array artist_name = GETTERS.get_artist_name(h5, i)
# get params hdf5path = sys.argv[1] songidx = 0 if len(sys.argv) > 2: songidx = int(sys.argv[2]) onegetter = '' if len(sys.argv) > 3: onegetter = sys.argv[3] # sanity check if not os.path.isfile(hdf5path): print 'ERROR: file',hdf5path,'does not exist.' sys.exit(0) h5 = hdf5_getters.open_h5_file_read(hdf5path) numSongs = hdf5_getters.get_num_songs(h5) if songidx >= numSongs: print 'ERROR: file contains only',numSongs h5.close() sys.exit(0) # get all getters getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys()) getters.remove("get_num_songs") # special case if onegetter == 'num_songs' or onegetter == 'get_num_songs': getters = [] elif onegetter != '': if onegetter[:4] != 'get_': onegetter = 'get_' + onegetter try: getters.index(onegetter)
def transfer(h5path, matpath=None, force=False): """ Transfer an HDF5 song file (.h5) to a matfile (.mat) If there are more than one song in the HDF5 file, each field name gets a number happened: 1, 2, 3, ...., numfiles PARAM h5path - path to the HDF5 song file matpath - path to the new matfile, same as HDF5 path with a different extension by default force - if True and matfile exists, overwrite RETURN True if the file was transfered, False if there was a problem. Could also raise an IOException NOTE All the data has to be loaded in memory! be careful if one file contains tons of songs! """ # sanity checks if not os.path.isfile(h5path): print 'path to HF5 files does not exist:', h5path return False if not os.path.splitext(h5path)[1] == '.h5': print 'expecting a .h5 extension for file:', h5path return False # check matfile if matpath is None: matpath = os.path.splitext(h5path)[0] + '.mat' if os.path.exists(matpath): if force: print 'overwriting file:', matpath else: # print 'matfile',matpath,'already exists (delete or force):' return False # get all getters! 
we assume that all we need is in hdf5_getters.py # further assume that they have the form get_blablabla and that's the # only thing that has that form getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys()) getters.remove("get_num_songs") # special case # open h5 file h5 = hdf5_getters.open_h5_file_read(h5path) # transfer nSongs = hdf5_getters.get_num_songs(h5) matdata = { 'transfer_note': 'transferred on ' + time.ctime() + ' from file: ' + h5path } try: # iterate over songs for songidx in xrange(nSongs): # iterate over getter for getter in getters: gettername = getter[4:] if nSongs > 1: gettername += str(songidx + 1) data = hdf5_getters.__getattribute__(getter)(h5, songidx) matdata[gettername] = data except MemoryError: print 'Memory Error with file:', h5path print 'All data has to be loaded in memory before being saved as matfile' print 'Is this an aggregated / summary file with tons of songs?' print 'This code is optimized for files containing one song,' print 'but write me an email! (TBM)' raise finally: # close h5 h5.close() # create sio.savemat(matpath, matdata) # all good return True
'track_7digitalid', 'year') sep = '\t' metadata.write(sep.join(list_attr1) + '\n') getters1 = get_getters(list_attr1) #run through each .h5 file contained in the folder progression = 0 interval = 0 for folder, subfolders, files in os.walk(data_folder): for f in files: if f.endswith('.h5') and not f.startswith('._'): #open the hdf5 file h5 = hdf5_getters.open_h5_file_read(folder + '/' + f) #add one entry line in the database per row contained in the file for song_nb in range(hdf5_getters.get_num_songs(h5)): write_line(metadata, getters1, h5, song_nb) progression += 1 interval += 1 if interval == 1000: print(progression) interval = 0 h5.close() metadata.close() print('nb songs = %d' % progression) elapsed_time = time.time() - start_time print('elapsed time = ' + str(elapsed_time) + ' sec')
import csv import math import hdf5_getters import operator print("loading...") h5 = hdf5_getters.open_h5_file_read("files/msd_summary_file.h5") data = [] length = hdf5_getters.get_num_songs(h5) print("number of songs = ",length) count = 0; for i in range(0,length): tmp = []; if hdf5_getters.get_year(h5,songidx=i) == 0 : continue; #if math.isnan(hdf5_getters.get_artist_latitude(h5,songidx=i)) and hdf5_getters.get_artist_location(h5,songidx=i) =='': # continue; count+=1; tmp.append(str(hdf5_getters.get_track_id(h5,songidx=i)).replace("b'","").replace("'","")); tmp.append(hdf5_getters.get_year(h5,songidx=i)); #0 tmp.append(hdf5_getters.get_song_hotttnesss(h5,songidx=i)); #1 tmp.append(str(hdf5_getters.get_title(h5,songidx=i)).replace("b'","").replace("'","")); #2 tmp.append(str(hdf5_getters.get_artist_id(h5,songidx=i)).replace("b'","").replace("'","")); #3 tmp.append(hdf5_getters.get_artist_latitude(h5,songidx=i)); #4 tmp.append(hdf5_getters.get_artist_longitude(h5,songidx=i)); #5 tmp.append(str(hdf5_getters.get_artist_location(h5,songidx=i)).replace("b'","").replace("'","")); #6 tmp.append(str(hdf5_getters.get_artist_name(h5,songidx=i)).replace("b'","").replace("'","")); #7 tmp.append(str(hdf5_getters.get_song_id(h5,songidx=i)).replace("b'","").replace("'",""));
#print "Calc: ", song.hotness, " + ", song.chart_score retval = 0.3*float(song.hotness) + 0.7*float(song.chart_score) return retval class Song: name = "" artist = "" year = 0; #Quantity in given year hotness = 0; chart_score = 0; pop_score = 0.0; h5 = h5get.open_h5_file_read("subset_msd_summary_file.h5") numSongs = h5get.get_num_songs(h5) #all_chart_info = {} """ with open('tsort-chart-2-2-0007.csv', 'rb') as csvfile: reader = csv.reader(csvfile) reader.next(); for row in reader: #print row artist = row[0] title = row[1] score = row[4] all_chart_info[(artist,title)] = score; """
# get params hdf5path = sys.argv[1] songidx = 0 if len(sys.argv) > 2: songidx = int(sys.argv[2]) onegetter = '' if len(sys.argv) > 3: onegetter = sys.argv[3] # sanity check if not os.path.isfile(hdf5path): print 'ERROR: file', hdf5path, 'does not exist.' sys.exit(0) h5 = hdf5_getters.open_h5_file_read(hdf5path) numSongs = hdf5_getters.get_num_songs(h5) if songidx >= numSongs: print 'ERROR: file contains only', numSongs h5.close() sys.exit(0) # get all getters getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys()) getters.remove("get_num_songs") # special case if onegetter == 'num_songs' or onegetter == 'get_num_songs': getters = [] elif onegetter != '': if onegetter[:4] != 'get_': onegetter = 'get_' + onegetter try: getters.index(onegetter)
def get_all_rows(basedir, ext='.h5'):
    """Read a fixed set of fields for every song under ``basedir``.

    The tree is walked with ``os.walk``; every file matching ``*<ext>`` is
    opened and each of its songs becomes one dict. Text fields are decoded
    from UTF-8 bytes; all other values are stored as returned by
    ``hdf5_getters``. The song index is printed as each song is read.

    Args:
        basedir: root directory to scan.
        ext: file extension (with leading dot) of the song files.

    Returns:
        list of dict: one dict per song, keyed by field name.
    """
    # (output key, hdf5_getters function name, decode bytes as UTF-8?)
    # Further getters (location, terms, segments, beats, ...) can be
    # enabled by adding rows here.
    fields = (
        ('artist_name', 'get_artist_name', True),
        ('artist_familiarity', 'get_artist_familiarity', False),
        ('artist_hotness', 'get_artist_hotttnesss', False),
        ('artist_id', 'get_artist_id', True),
        ('artist_playmeid', 'get_artist_playmeid', False),
        ('artist_7digitalid', 'get_artist_7digitalid', False),
        ('release', 'get_release', True),
        ('song_hotttnesss', 'get_song_hotttnesss', False),
        ('title', 'get_title', True),
        ('danceability', 'get_danceability', False),
        ('duration', 'get_duration', False),
        ('end_of_fade_in', 'get_end_of_fade_in', False),
        ('energy', 'get_energy', False),
        ('key', 'get_key', False),
        ('key_confidence', 'get_key_confidence', False),
        ('loudness', 'get_loudness', False),
        ('mode', 'get_mode', False),
        ('tempo', 'get_tempo', False),
        ('time_signature', 'get_time_signature', False),
        ('track_id', 'get_track_id', True),
        ('year', 'get_year', False),
    )

    rows = []
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                for i in range(hdf5_getters.get_num_songs(h5)):
                    print(i)
                    obj = {}
                    for key, getter_name, is_text in fields:
                        value = getattr(hdf5_getters, getter_name)(h5, i)
                        obj[key] = value.decode('UTF-8') if is_text else value
                    rows.append(obj)
            finally:
                # Close the handle even when a getter or decode fails.
                h5.close()
    return rows
# Ubuntu: you can change the environment variable PYTHONPATH # in your .bashrc file so you do not have to type these lines sys.path.append(os.path.join(msd_code_path, 'PythonSrc')) # imports specific to the MSD import hdf5_getters as GETTERS cnt = 0 loops = 0 for alpha in string.ascii_uppercase: for root, dirs, files in os.walk('/mnt/million-songs/data/' + alpha): files = glob.glob(os.path.join(root, '*' + '.h5')) for f in files: h5 = GETTERS.open_h5_file_read(f) num_songs = GETTERS.get_num_songs(h5) print f, num_songs for i in range(num_songs): analysis_sample_rate = GETTERS.get_analysis_sample_rate(h5, i) artist_7digitalid = GETTERS.get_artist_7digitalid(h5, i) artist_familiarity = GETTERS.get_artist_familiarity(h5, i) artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5, i) artist_id = GETTERS.get_artist_id(h5, i) artist_latitude = GETTERS.get_artist_latitude(h5, i) artist_location = GETTERS.get_artist_location(h5, i) artist_longitude = GETTERS.get_artist_longitude(h5, i) artist_mbid = GETTERS.get_artist_mbid(h5, i) artist_mbtags = ','.join( str(e) for e in GETTERS.get_artist_mbtags(h5, i)) # array artist_mbtags_count = ','.join(
def _read_song_fields(h5, i, yr):
    """Return the per-song fields of song ``i`` as (key, value) pairs."""
    dance = hdf5_getters.get_danceability(h5, i)
    energy = hdf5_getters.get_energy(h5, i)
    s_hot = hdf5_getters.get_song_hotttnesss(h5, i)
    return [
        # 0.0 danceability/energy and NaN hotttnesss are stored as None
        ('danceability', None if dance == 0.0 else dance),
        ('duration', hdf5_getters.get_duration(h5, i)),
        ('end_of_fade_in', hdf5_getters.get_end_of_fade_in(h5, i)),
        ('energy', None if energy == 0.0 else energy),
        ('key', hdf5_getters.get_key(h5, i)),
        ('loudness', hdf5_getters.get_loudness(h5, i)),
        ('mode', hdf5_getters.get_mode(h5, i)),  # major or minor 0/1
        ('song_hotttnesss', None if math.isnan(s_hot) else s_hot),
        ('start_of_fade_out', hdf5_getters.get_start_of_fade_out(h5, i)),
        ('tempo', hdf5_getters.get_tempo(h5, i)),
        ('time_signature', hdf5_getters.get_time_signature(h5, i)),
        ('track_id', hdf5_getters.get_track_id(h5, i)),
        ('year', yr),
    ]


def _merge_song_file(file_name, artist_map, skip_known_tracks):
    """Merge the usable songs of one aggregate file into ``artist_map``."""
    h5 = hdf5_getters.open_h5_file_read(file_name)
    try:
        for i in range(hdf5_getters.get_num_songs(h5)):
            artist_name = hdf5_getters.get_artist_name(h5, i)

            # Second-file mode: ignore tracks already recorded for the artist.
            if skip_known_tracks and artist_name in artist_map:
                song_id = hdf5_getters.get_track_id(h5, i)
                if song_id in artist_map[artist_name]['track_id']:
                    continue

            # filter location: skip if no coordinates
            longi = hdf5_getters.get_artist_longitude(h5, i)
            lat = hdf5_getters.get_artist_latitude(h5, i)
            loc = hdf5_getters.get_artist_location(h5, i)
            if math.isnan(lat) or math.isnan(longi):
                continue

            # filter year: skip if no year
            yr = hdf5_getters.get_year(h5, i)
            if yr == 0:
                continue

            # filter hotttness and familiarity: skip if not computed
            familiarity = hdf5_getters.get_artist_familiarity(h5, i)
            hotttness = hdf5_getters.get_artist_hotttnesss(h5, i)
            if familiarity <= 0.0 or hotttness <= 0.0:
                continue

            # TODO: maybe filter on dance and energy; compute an average
            # timbre per song. The original read segments_timbre here but
            # never used it, so the read is dropped until then.
            song_fields = _read_song_fields(h5, i, yr)

            if artist_name not in artist_map:
                # First song of this artist: artist-level data plus
                # one-element lists for the song-level fields.
                sub_map = {
                    'artist_familiarity': familiarity,
                    'artist_hotttnesss': hotttness,
                    'artist_id': hdf5_getters.get_artist_id(h5, i),
                    'artist_latitude': lat,
                    'artist_longitude': longi,
                    'artist_location': loc,
                    'artist_terms': hdf5_getters.get_artist_terms(h5, i),
                    # TODO: see if terms should be weighted by freq or weight
                    'artist_terms_freq':
                        list(hdf5_getters.get_artist_terms_freq(h5, i)),
                    'artist_terms_weight':
                        list(hdf5_getters.get_artist_terms_weight(h5, i)),
                }
                for key, value in song_fields:
                    sub_map[key] = [value]
                artist_map[artist_name] = sub_map
            else:
                # Known artist: extend the song-level lists only.
                sub_map = artist_map[artist_name]
                for key, value in song_fields:
                    sub_map[key].append(value)
    finally:
        # The original never closed either file.
        h5.close()


def parse_aggregate_songs(file_name, file_name2, artist_map):
    """
    Merge two aggregate song files into ``artist_map``, in place.

    ``artist_map`` has the format {artist_name: {data pertaining to
    artist}}. Artist-level values are stored once, when the artist is
    first seen. Song-level values are lists with one entry per accepted
    song (attribute name -> [values]).

    Songs are skipped when the artist has no latitude/longitude, the year
    is 0, or artist familiarity/hotttnesss is not positive. Songs of the
    second file whose track id is already recorded for the artist are
    skipped as well.
    """
    print('Parsing song file...')
    _merge_song_file(file_name, artist_map, skip_known_tracks=False)
    print('Parsing song file2...')
    _merge_song_file(file_name2, artist_map, skip_known_tracks=True)
# Sample track that is inspected when the caller does not supply a path.
_DEFAULT_SAMPLE_H5 = (
    '/home/skalogerakis/Documents/MillionSong/MillionSongSubset/A/M/G/TRAMGDX12903CEF79F.h5'
)

# (label, hdf5_getters function name) pairs, in the order they are printed.
_SAMPLE_FIELD_GETTERS = (
    ("Num of songs -- ", "get_num_songs"),
    ("Title -- ", "get_title"),
    ("Artist familiarity -- ", "get_artist_familiarity"),
    ("Artist hotness -- ", "get_artist_hotttnesss"),
    ("Artist ID -- ", "get_artist_id"),
    ("Artist mbID -- ", "get_artist_mbid"),
    ("Artist playmeid -- ", "get_artist_playmeid"),
    ("Artist 7DigitalID -- ", "get_artist_7digitalid"),
    ("Artist latitude -- ", "get_artist_latitude"),
    ("Artist longitude -- ", "get_artist_longitude"),
    ("Artist location -- ", "get_artist_location"),
    ("Artist Name -- ", "get_artist_name"),
    ("Release -- ", "get_release"),
    ("Release 7DigitalID -- ", "get_release_7digitalid"),
    ("Song ID -- ", "get_song_id"),
    ("Song Hotness -- ", "get_song_hotttnesss"),
    ("Track 7Digital -- ", "get_track_7digitalid"),
    ("Similar artists -- ", "get_similar_artists"),
    ("Artist terms -- ", "get_artist_terms"),
    ("Artist terms freq -- ", "get_artist_terms_freq"),
    ("Artist terms weight -- ", "get_artist_terms_weight"),
    ("Analysis sample rate -- ", "get_analysis_sample_rate"),
    ("Audio md5 -- ", "get_audio_md5"),
    ("Danceability -- ", "get_danceability"),
    ("Duration -- ", "get_duration"),
    ("End of Fade -- ", "get_end_of_fade_in"),
    ("Energy -- ", "get_energy"),
    ("Key -- ", "get_key"),
    ("Key Confidence -- ", "get_key_confidence"),
    ("Loudness -- ", "get_loudness"),
    ("Mode -- ", "get_mode"),
    ("Mode Confidence -- ", "get_mode_confidence"),
    ("Start of fade out -- ", "get_start_of_fade_out"),
    ("Tempo -- ", "get_tempo"),
    ("Time signature -- ", "get_time_signature"),
    ("Time signature confidence -- ", "get_time_signature_confidence"),
    ("Track ID -- ", "get_track_id"),
    ("Segments Start -- ", "get_segments_start"),
    ("Segments Confidence -- ", "get_segments_confidence"),
    ("Segments Pitches -- ", "get_segments_pitches"),
    ("Segments Timbre -- ", "get_segments_timbre"),
    ("Segments Loudness max -- ", "get_segments_loudness_max"),
    ("Segments Loudness max time-- ", "get_segments_loudness_max_time"),
    ("Segments Loudness start -- ", "get_segments_loudness_start"),
    ("Sections start -- ", "get_sections_start"),
    ("Sections Confidence -- ", "get_sections_confidence"),
    ("Beats start -- ", "get_beats_start"),
    ("Beats confidence -- ", "get_beats_confidence"),
    ("Bars start -- ", "get_bars_start"),
    ("Bars confidence -- ", "get_bars_confidence"),
    ("Tatums start -- ", "get_tatums_start"),
    ("Tatums confidence -- ", "get_tatums_confidence"),
    ("Artist mbtags -- ", "get_artist_mbtags"),
    ("Artist mbtags count -- ", "get_artist_mbtags_count"),
    ("Year -- ", "get_year"),
)


def hd5_single_random_file_parser(h5_path=_DEFAULT_SAMPLE_H5):
    """Print the contents of one HDF5 song file and export two of its fields.

    The function first prints the name of every attribute of ``hdf5_getters``
    that starts with ``get``, then prints each field listed in
    ``_SAMPLE_FIELD_GETTERS`` for the first song of the file, and finally
    writes the title and artist ID to ``Tester2.csv`` (``;``-delimited, with a
    header row) in the current working directory.

    Args:
        h5_path: Path of the HDF5 file to inspect. Defaults to the sample
            track the function was originally written against.
    """
    h5 = hdf5_getters.open_h5_file_read(h5_path)
    try:
        # List all getter functions the module offers.
        for name in hdf5_getters.__dict__.keys():
            if name.startswith('get'):
                print(name)
        print()

        # First look at what each field contains.
        for label, getter_name in _SAMPLE_FIELD_GETTERS:
            print(label, getattr(hdf5_getters, getter_name)(h5))

        fields = ['Title', 'Artist ID']
        with open('Tester2.csv', 'w', newline='') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=';')
            csv_writer.writerow(fields)  # header
            csv_writer.writerow(
                [hdf5_getters.get_title(h5), hdf5_getters.get_artist_id(h5)])
    finally:
        # Release the HDF5 handle even when a getter or the CSV write fails.
        h5.close()
import hdf5_getters

# Print the song ID of every entry stored in the summary file.
h5 = hdf5_getters.open_h5_file_read("data/msd_summary_file.h5")
try:
    for i in range(hdf5_getters.get_num_songs(h5)):
        print(hdf5_getters.get_song_id(h5, i))
finally:
    # Close the file even if reading one of the entries fails.
    h5.close()
# Column names accepted at the interactive prompt, in their canonical spelling.
_PROMPT_COLUMN_NAMES = (
    'AlbumID', 'AlbumName', 'ArtistID', 'ArtistLatitude', 'ArtistLocation',
    'ArtistLongitude', 'ArtistName', 'Danceability', 'Duration',
    'KeySignature', 'KeySignatureConfidence', 'SongID', 'Tempo',
    'TimeSignature', 'TimeSignatureConfidence', 'Title', 'Year',
)

# Text emitted in place of a value when an attribute name is not recognised.
_UNKNOWN_ATTRIBUTE_TEXT = "Erm. This didn't work. Error. :( :(\n"


def _quoted(text):
    """Wrap *text* in double quotes (embedded quotes are left untouched)."""
    return "\"" + text + "\""


def _blank_if_nan(text):
    """Return an empty string for the literal text 'nan', else *text*."""
    return '' if text == 'nan' else text


# Lower-cased attribute name -> function producing that CSV cell for a Song.
_CSV_FIELD_FORMATTERS = {
    'albumid': lambda song: song.albumID,
    'albumname': lambda song: _quoted(song.albumName.replace(',', "")),
    'trackid': lambda song: song.trackId,
    'artistid': lambda song: _quoted(song.artistID),
    'artistlatitude': lambda song: _blank_if_nan(song.artistLatitude),
    'artistlocation': lambda song: _quoted(song.artistLocation.replace(',', '')),
    'artistlongitude': lambda song: _blank_if_nan(song.artistLongitude),
    'artistname': lambda song: _quoted(song.artistName),
    'danceability': lambda song: song.danceability,
    'duration': lambda song: song.duration,
    'keysignature': lambda song: song.keySignature,
    'keysignatureconfidence': lambda song: song.keySignatureConfidence,
    'songid': lambda song: _quoted(song.id),
    'tempo': lambda song: song.tempo,
    'timesignature': lambda song: song.timeSignature,
    'timesignatureconfidence': lambda song: song.timeSignatureConfidence,
    'title': lambda song: _quoted(song.title),
    'year': lambda song: song.year,
}


def _prompt_for_csv_header():
    """Ask for the column order until the answer contains only known names.

    Returns a ``(header_line, attribute_list)`` pair: the header text ending
    in a newline, and the lower-cased attribute names in the requested order.
    Typing ``exit`` terminates the program through ``sys.exit()``.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    canonical = dict((name.lower(), name) for name in _PROMPT_COLUMN_NAMES)

    while True:
        csvAttributeString = read_line(
            "\n\nIn what order would you like the colums of the CSV file?\n" +
            "Please delineate with commas. The options are: " +
            "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude," +
            " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," +
            " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" +
            "For example, you may write \"Title, Tempo, Duration\"...\n\n" +
            "...or exit by typing 'exit'.\n\n")
        attributes = [item.lower() for item in re.split(r'\W+', csvAttributeString)]

        header_names = []
        for attribute in attributes:
            if attribute == 'exit':
                sys.exit()
            if attribute not in canonical:
                print("==============")
                print("I believe there has been an error with the input.")
                print("==============")
                break
            header_names.append(canonical[attribute])
        else:
            # Every name was valid; only now is a header produced, so a bad
            # answer no longer leaves a truncated header line in the output.
            return ",".join(header_names) + "\n", attributes


def _read_song(songH5File, cnt):
    """Build a Song from entry *cnt* of an open HDF5 file, all fields as str."""
    song = Song(str(hdf5_getters.get_song_id(songH5File, cnt)))
    song.trackId = str(hdf5_getters.get_track_id(songH5File, cnt))
    song.artistID = str(hdf5_getters.get_artist_id(songH5File, cnt))
    song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File, cnt))
    song.albumName = str(hdf5_getters.get_release(songH5File, cnt))
    song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File, cnt))
    song.artistLocation = str(hdf5_getters.get_artist_location(songH5File, cnt))
    song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File, cnt))
    song.artistName = str(hdf5_getters.get_artist_name(songH5File, cnt))
    song.danceability = str(hdf5_getters.get_danceability(songH5File, cnt))
    song.duration = str(hdf5_getters.get_duration(songH5File, cnt))
    song.keySignature = str(hdf5_getters.get_key(songH5File, cnt))
    song.keySignatureConfidence = str(
        hdf5_getters.get_key_confidence(songH5File, cnt))
    song.tempo = str(hdf5_getters.get_tempo(songH5File, cnt))
    song.timeSignature = str(hdf5_getters.get_time_signature(songH5File, cnt))
    song.timeSignatureConfidence = str(
        hdf5_getters.get_time_signature_confidence(songH5File, cnt))
    song.title = str(hdf5_getters.get_title(songH5File, cnt))
    song.year = str(hdf5_getters.get_year(songH5File, cnt))
    return song


def _csv_row(song, csvAttributeList):
    """Return one CSV line: the song count followed by the requested cells."""
    cells = [str(song.songCount)]
    for attribute in csvAttributeList:
        formatter = _CSV_FIELD_FORMATTERS.get(attribute)
        cells.append(
            _UNKNOWN_ATTRIBUTE_TEXT if formatter is None else formatter(song))
    return ",".join(cells) + "\n"


def main():
    """Export every song found under ``basedir`` to ``SongCSV.csv``.

    The column order is either asked from the user (``prompt = True``) or
    taken from the hard-coded list below (``prompt = False``, the default).
    Every ``*.h5`` file under ``basedir`` is opened, each song in it is read
    into a ``Song`` object, and one CSV line per song is written. Each data
    line starts with ``song.songCount``.
    """
    #################################################
    # Set to True to ask the user for the column order; leave False to use
    # the hard-coded order defined in the else branch below.
    prompt = False
    #################################################

    # Root directory of the search and the extension of the HDF5 files.
    basedir = "/home/umwangye/millonsong/MillionSongSubset/data/"
    ext = ".h5"

    # The with-block closes the CSV even when a song file cannot be read.
    with open('SongCSV.csv', 'w') as outputFile1:
        if prompt:
            # NOTE(review): this header has no "SongNumber" column although
            # every data line starts with the song count -- confirm intent.
            header, csvAttributeList = _prompt_for_csv_header()
            outputFile1.write(header)
        else:
            #################################################
            # Change the order of the CSV columns here.
            csvRowString = (
                "SongID,AlbumID,AlbumName,TrackId,ArtistID,ArtistLatitude,ArtistLocation," +
                "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature," +
                "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence," +
                "Title,Year")
            #################################################
            csvAttributeList = [
                name.lower() for name in re.split(r'\W+', csvRowString)]
            outputFile1.write("SongNumber,")
            outputFile1.write(csvRowString + "\n")

        for root, dirs, files in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*' + ext)):
                print(f)
                songH5File = hdf5_getters.open_h5_file_read(f)
                try:
                    numPerH5 = hdf5_getters.get_num_songs(songH5File)
                    for cnt in range(numPerH5):
                        song = _read_song(songH5File, cnt)
                        outputFile1.write(_csv_row(song, csvAttributeList))
                finally:
                    # Release the HDF5 handle even if a getter raised.
                    songH5File.close()
import csv import math import hdf5_getters import operator print("loading...") h5 = hdf5_getters.open_h5_file_read("files/msd_summary_file.h5") data = [] length = hdf5_getters.get_num_songs(h5) print("number of songs = ", length) count = 0 for i in range(0, length): tmp = [] if hdf5_getters.get_year(h5, songidx=i) == 0: continue #if math.isnan(hdf5_getters.get_artist_latitude(h5,songidx=i)) and hdf5_getters.get_artist_location(h5,songidx=i) =='': # continue; count += 1 tmp.append( str(hdf5_getters.get_track_id(h5, songidx=i)).replace("b'", "").replace("'", "")) tmp.append(hdf5_getters.get_year(h5, songidx=i)) #0 tmp.append(hdf5_getters.get_song_hotttnesss(h5, songidx=i)) #1 tmp.append( str(hdf5_getters.get_title(h5,
getters1 = get_getters(list_attr1)

# Run through each .h5 file contained in the folder and write one entry line
# per song row; both handles are closed even if reading or writing fails.
progression = 0
interval = 0
try:
    for folder, subfolders, files in os.walk(data_folder):
        for f in files:
            # Only real HDF5 files; names starting with '._' are skipped.
            if not f.endswith('.h5') or f.startswith('._'):
                continue
            # open the hdf5 file
            h5 = hdf5_getters.open_h5_file_read(os.path.join(folder, f))
            try:
                # add one entry line in the database per row contained in the file
                for song_nb in range(hdf5_getters.get_num_songs(h5)):
                    write_line(metadata, getters1, h5, song_nb)
                    progression += 1
                    interval += 1
                    # report progress every 1000 songs
                    if interval == 1000:
                        print(progression)
                        interval = 0
            finally:
                h5.close()
finally:
    metadata.close()
print('nb songs = %d' % progression)
elapsed_time = time.time() - start_time
print('elapsed time = ' + str(elapsed_time) + ' sec')
def transfer(h5path,matpath=None,force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there is more than one song in the HDF5 file, each
    field name gets a number appended: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path
                  with a different extension by default
        force   - if True and matfile exists, overwrite
    RETURN
        True if the file was transferred, False if there was
        a problem (missing input, wrong extension, or an existing
        matfile without force).
        A MemoryError raised while loading is reported and re-raised;
        other errors from reading or saving propagate unchanged.
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print('path to HF5 files does not exist: %s' % (h5path,))
        return False
    if os.path.splitext(h5path)[1] != '.h5':
        print('expecting a .h5 extension for file: %s' % (h5path,))
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if force:
            print('overwriting file: %s' % (matpath,))
        else:
            print('matfile %s already exists (delete or force):' % (matpath,))
            return False
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form. get_num_songs is a special case: it
    # takes no song index, so it is excluded here. A list (not filter()) is
    # built so this works on both Python 2 and Python 3.
    getters = [name for name in vars(hdf5_getters)
               if name.startswith('get_') and name != 'get_num_songs']
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    try:
        # read the song count inside the try so the handle is closed if it fails
        nSongs = hdf5_getters.get_num_songs(h5)
        # iterate over songs
        for songidx in range(nSongs):
            # iterate over getters
            for getter in getters:
                gettername = getter[4:]
                # disambiguate field names when the file holds several songs
                if nSongs > 1:
                    gettername += str(songidx+1)
                matdata[gettername] = getattr(hdf5_getters, getter)(h5,songidx)
    except MemoryError:
        print('Memory Error with file: %s' % (h5path,))
        print('All data has to be loaded in memory before being saved as matfile')
        print('Is this an aggregated / summary file with tons of songs?')
        print('This code is optimized for files containing one song,')
        print('but write me an email! (TBM)')
        raise
    finally:
        # close h5
        h5.close()
    # create
    sio.savemat(matpath,matdata)
    # all good
    return True
def get_all_examples(basedir, genre_dict, ext='.h5'):
    """
    From a base directory, goes through all subdirectories,
    and grabs all songs and their features and puts them into a pandas dataframe

    Only songs whose track id is a key of genre_dict are kept.

    INPUT
       basedir    - base directory of the dataset
       genre_dict - a dictionary mapping track id to genre based tagraum dataset
       ext        - extension, .h5 by default
    RETURN
       dataframe containing all song examples, one row per kept song, with a
       default integer index; it has the feature columns (and no rows) when
       nothing matched
    """
    columns = ['artist_name', 'title', 'song_id', 'genre', 'year', 'key',
               'key_confidence', 'mode', 'mode_confidence', 'time_signature',
               'time_signature_confidence', 'duration', 'end_of_fade_in',
               'loudness', 'song_hotttnesss', 'tempo']
    # Rows are collected in a plain list and turned into a frame once at the
    # end: appending to a DataFrame per song copies the whole frame each time,
    # and DataFrame.append no longer exists in pandas 2.x.
    rows = []

    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = GETTERS.open_h5_file_read(f)
            try:
                for i in range(GETTERS.get_num_songs(h5)):
                    if i % 10000 == 0:
                        print(i)  # progress indicator for multi-song files
                    song_id = GETTERS.get_track_id(h5, i).decode('utf-8')
                    if song_id not in genre_dict:
                        continue
                    rows.append((
                        # the song index is passed so that files holding
                        # several songs yield each song's own artist/title
                        GETTERS.get_artist_name(h5, i),
                        GETTERS.get_title(h5, i),
                        song_id,
                        genre_dict[song_id],
                        GETTERS.get_year(h5, i),
                        GETTERS.get_key(h5, i),
                        GETTERS.get_key_confidence(h5, i),
                        GETTERS.get_mode(h5, i),
                        GETTERS.get_mode_confidence(h5, i),
                        GETTERS.get_time_signature(h5, i),
                        GETTERS.get_time_signature_confidence(h5, i),
                        GETTERS.get_duration(h5, i),
                        GETTERS.get_end_of_fade_in(h5, i),
                        GETTERS.get_loudness(h5, i),
                        GETTERS.get_song_hotttnesss(h5, i),
                        GETTERS.get_tempo(h5, i),
                    ))
            finally:
                # close the file even if one of the getters raised
                h5.close()

    return pd.DataFrame(rows, columns=columns)
# Field names getInfo accepts; each one maps to getters.get_<name>.
_GETINFO_FIELDS = frozenset((
    'track_id', 'segments_pitches', 'time_signature_confidence',
    'song_hotttnesss', 'artist_longitude', 'tatums_confidence', 'num_songs',
    'duration', 'start_of_fade_out', 'artist_name', 'similar_artists',
    'artist_mbtags', 'artist_terms_freq', 'release', 'song_id',
    'track_7digitalid', 'title', 'artist_latitude', 'energy', 'key',
    'release_7digitalid', 'artist_mbid', 'segments_confidence',
    'artist_hotttnesss', 'time_signature', 'segments_loudness_max_time',
    'mode', 'segments_loudness_start', 'tempo', 'key_confidence',
    'analysis_sample_rate', 'bars_confidence', 'artist_playmeid',
    'artist_terms_weight', 'segments_start', 'artist_location', 'loudness',
    'year', 'artist_7digitalid', 'audio_md5', 'segments_timbre',
    'mode_confidence', 'end_of_fade_in', 'danceability', 'artist_familiarity',
    'artist_mbtags_count', 'tatums_start', 'artist_id',
    'segments_loudness_max', 'bars_start', 'beats_start', 'artist_terms',
    'sections_start', 'beats_confidence', 'sections_confidence',
))


def _getinfo_cell_text(value):
    """Render one getter result as single-line text for a CSV cell.

    Byte strings are decoded instead of being printed through their repr, so
    no ``b'...'`` prefix has to be stripped afterwards.
    """
    if isinstance(value, bytes):
        text = value.decode('utf-8', 'replace')
    elif getattr(getattr(value, 'dtype', None), 'kind', None) == 'S':
        # Array of byte strings: keep the space-separated "[a b c]" layout
        # that str() of such an array produces, but with decoded items.
        items = [item.decode('utf-8', 'replace')
                 for item in value.ravel().tolist()]
        text = '[' + ' '.join(items) + ']'
    else:
        text = str(value)
    return text.replace('\n', '')


def getInfo(files):
    """Build CSV text holding the requested fields of each HDF5 file.

    The field names are read, whitespace-separated, from the file named by
    ``sys.argv[1]``. The result starts with a header line of those names,
    followed by one comma-separated line per entry of *files*. Single and
    double quotes are removed from the whole result.

    Only the getters for the requested fields are called. If a requested
    name is not a known field, an error is printed and the process exits.

    Args:
        files: Paths of the HDF5 files to read.

    Returns:
        The CSV content as a single string.
    """
    with open(sys.argv[1], 'r') as f:
        contents = f.read()
    c = contents.split()
    print("creating csv with following fields:" + contents)

    lines = [','.join(c)]
    for fil in files:
        for field in c:
            if field not in _GETINFO_FIELDS:
                print('error: unspecified field')
                # NOTE(review): exit status 0 on an error path is kept from
                # the original -- confirm whether callers expect non-zero.
                sys.exit(0)
        curFile = getters.open_h5_file_read(fil)
        try:
            lines.append(','.join(
                _getinfo_cell_text(getattr(getters, 'get_' + field)(curFile))
                for field in c))
        finally:
            # release the handle even if a getter raised
            curFile.close()

    build_str = '\n'.join(lines) + '\n'
    # Quotes are still stripped as before; the old blanket removal of the
    # letter 'b' is gone because it corrupted names such as 'danceability'.
    return build_str.replace("'", '').replace('"', '')