Example #1
0
def get_attribute(files):
    """Read a fixed set of fields from each HDF5 song file.

    For every path in ``files`` the path is printed, the file is opened with
    ``hdf5_getters.open_h5_file_read`` and the getters listed below are called
    on it (without a song index).  The collected values are passed through
    ``np.nan_to_num`` and appended to the result.

    Args:
        files: iterable of paths to HDF5 song files.

    Returns:
        A list with one ``np.nan_to_num`` result per input file, in input
        order.
    """
    # Order matters: it defines the position of each value in a row.
    getters = (
        hdf5_getters.get_num_songs,
        hdf5_getters.get_artist_familiarity,
        hdf5_getters.get_artist_hotttnesss,
        hdf5_getters.get_danceability,
        hdf5_getters.get_energy,
        hdf5_getters.get_key,
        hdf5_getters.get_key_confidence,
        hdf5_getters.get_loudness,
        hdf5_getters.get_mode,
        hdf5_getters.get_mode_confidence,
        hdf5_getters.get_tempo,
        hdf5_getters.get_time_signature,
        hdf5_getters.get_time_signature_confidence,
        hdf5_getters.get_title,
        hdf5_getters.get_artist_name,
    )
    array = []
    for f in files:
        print(f)
        h5 = hdf5_getters.open_h5_file_read(f)
        try:
            temp = [getter(h5) for getter in getters]
        finally:
            # Release the handle even when one of the getters raises.
            h5.close()
        # NOTE(review): the row mixes numeric fields with title/artist name;
        # confirm that nan_to_num on such a mixed row yields what callers
        # expect.
        array.append(np.nan_to_num(temp))
    return array
Example #2
0
def parse_songs(directory):
    """Recursively convert HDF5 song files under ``directory`` to JSON.

    Walks ``directory`` depth-first.  Each non-directory entry is counted in
    the module-level ``count`` and opened as an HDF5 song file; for every song
    in it a dict with title, year, danceability, genres and tempo is written
    to ``/home/ubuntu/million_songs/parsed_data/<file stem>.json``.
    Processing stops once ``count`` reaches the module-level ``MAX_SONGS``.

    Args:
        directory: path of the directory to scan.
    """
    global count
    for filename in os.listdir(directory):
        if count >= MAX_SONGS:
            return
        file_path = os.path.join(directory, filename)
        if os.path.isdir(file_path):
            parse_songs(file_path)
            continue

        count += 1
        if count % 100 == 0:
            print('Parsed ' + str(count) + ' songs')

        # Keep the file stem in its own variable: the original code stored it
        # in ``song`` and therefore dumped the (stem, extension) tuple instead
        # of the song record.
        stem = os.path.splitext(filename)[0]
        out_path = "/home/ubuntu/million_songs/parsed_data/" + stem + '.json'

        with hdf5_getters.open_h5_file_read(file_path) as h5:
            for i in range(hdf5_getters.get_num_songs(h5)):
                tags = hdf5_getters.get_artist_mbtags(h5, i).tolist()
                song = {
                    'title': hdf5_getters.get_title(h5, i).decode('UTF-8'),
                    'year': hdf5_getters.get_year(h5, i).item(),
                    'danceability': hdf5_getters.get_danceability(h5, i).item(),
                    'genres': [tag.decode('UTF-8') for tag in tags],
                    'tempo': hdf5_getters.get_tempo(h5, i).item()
                }
                # The output name depends only on the input file, so a file
                # holding several songs keeps the last one written.
                with open(out_path, 'w') as fp:
                    json.dump(song, fp)
Example #3
0
    def load(self, msd_summary_file):
        """Open the given summary file and record its handle and song count.

        Stores the path on the instance, runs ``self.check()``, opens the
        file read-only into ``self.h5_fd`` and stores the number of songs it
        reports (as ``int``) in ``self.num_songs``.
        """
        self.msd_summary_file = msd_summary_file
        self.check()

        self.h5_fd = hdf5_getters.open_h5_file_read(self.msd_summary_file)
        reported_total = hdf5_getters.get_num_songs(self.h5_fd)
        self.num_songs = int(reported_total)
        message = "Found {} songs in summary file".format(self.num_songs)
        logger.debug(message)
Example #4
0
def traverseAndWrite(root, genreDirs, genreKeys):
    """Recursively export songs that match a genre into per-genre JSON files.

    If ``root`` is not a regular file, every entry below it is visited
    recursively.  Otherwise ``root`` is opened as an HDF5 song file and, for
    each song that passes ``notValidSong``, one record is appended to
    ``<genreDirs[genre]>/<artist>--<title>.json`` for every genre in
    ``genreKeys`` that ``genreInTags`` finds in the artist tags.

    Args:
        root: path of a directory or of an HDF5 song file.
        genreDirs: mapping from genre to its output directory.
        genreKeys: iterable of genres to test each song against.
    """
    if not isfile(root):
        for f in listdir(root):
            traverseAndWrite(root + "/" + f, genreDirs, genreKeys)
        return

    # Characters allowed in generated file names.
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)

    h5 = hdf5_getters.open_h5_file_read(root)
    try:
        numOfSongs = hdf5_getters.get_num_songs(h5)
        for index in range(numOfSongs):
            tags = hdf5_getters.get_artist_mbtags(h5, index)
            artist = hdf5_getters.get_artist_name(h5, index)
            songName = hdf5_getters.get_title(h5, index)
            segmentTimbre = hdf5_getters.get_segments_timbre(h5, index)
            segmentPitches = hdf5_getters.get_segments_pitches(h5, index)
            if notValidSong(tags, artist, songName, segmentTimbre, segmentPitches):
                # Skip only this song.  The file must stay open because the
                # remaining songs are still read from it.
                continue

            # Sanitised copies are used for the file name only, so every
            # genre record keeps the unmodified artist and title.
            safeSongName = ''.join(c for c in songName if c in valid_chars)
            safeArtist = ''.join(c for c in artist if c in valid_chars)

            for genre in genreKeys:
                if not genreInTags(genre, tags):
                    continue
                song = {
                    'genre': genre,
                    'artist_name': artist,
                    'song_title': songName,
                    'segments_pitches': segmentPitches.tolist(),
                    'segments_timbre': segmentTimbre.tolist(),
                }
                out_path = genreDirs[genre] + "/" + safeArtist + "--" + safeSongName + ".json"
                fd = open(out_path, 'a')
                try:
                    writeToDescriptor(fd, song)
                finally:
                    fd.close()
    finally:
        # Close exactly once, whether or not reading or writing failed.
        h5.close()
Example #5
0
def csv_convert(basedir, csv_filename):
	'''Convert every song found in the .h5 files under basedir to one csv line.

	The module-level ``header`` is written first; then each song accepted by
	``validate_song`` is written using ``h5_to_csv_fields``.  Progress is
	printed every 10 songs and the elapsed time at the end.

		Inputs: basedir, the directory that is walked recursively for *.h5 files
			csv_filename, name of the output file created under /home/ec2-user/'''
	t1 = time.time()
	cnt = 0
	with open("/home/ec2-user/{}".format(csv_filename), "w") as csv_file:
		csv_file.write(header)
		for root, dirs, files in os.walk(basedir):
			for f in glob.glob(os.path.join(root, '*.h5')):
				h5 = gt.open_h5_file_read(f)
				try:
					# each h5 file can hold multiple songs
					num_songs = gt.get_num_songs(h5)
					for j in range(int(num_songs)):
						if validate_song(h5, j):
							cnt += 1
							csv_file.write(h5_to_csv_fields(h5, j))
							# sanity check to make sure this is working
							if cnt % 10 == 0:
								print("{} files csved thus far".format(cnt))
				finally:
					# close the file even when a song fails to convert
					h5.close()
	t2 = time.time()
	print ('directory {} csv-ed in:'.format(basedir), strtimedelta(t1,t2))
Example #6
0
def get(getters, h5file):
	"""Collect scalar getter results for the first song of an HDF5 file.

	Exits the process (status 0) after printing an error when ``h5file`` does
	not exist or holds no song.  Otherwise each name in ``getters`` is
	resolved as ``hdf5_getters.get_<name>`` and called for song index 0;
	non-ndarray results are stored in a local dict keyed by the getter name.

	NOTE(review): the visible code neither returns the collected dict nor
	closes the file on the success path; confirm whether the snippet is
	truncated before relying on it.

	Args:
		getters: iterable of getter names without the ``get_`` prefix.
		h5file: path to the HDF5 song file.
	"""
	# sanity check
	if not os.path.isfile(h5file):
		print('ERROR: file %s does not exist.' % (h5file,))
		sys.exit(0)
	h5 = hdf5_getters.open_h5_file_read(h5file)
	numSongs = hdf5_getters.get_num_songs(h5)
	songidx = 0
	if songidx >= numSongs:
		print('ERROR: file contains only %s' % (numSongs,))
		h5.close()
		sys.exit(0)

	line = dict()
	for getter in getters:
		try:
			res = getattr(hdf5_getters, 'get_' + getter)(h5, songidx)
		except AttributeError as e:
			print(e)
			# Without this, ``res`` would be undefined (first getter) or
			# still hold the previous getter's value.
			continue
		if res.__class__.__name__ == 'ndarray':
			# Array-valued fields are skipped; only scalar values are kept.
			print('Ignoring....')
		else:
			line[getter] = res
Example #7
0
def h5file_data(h5file):
    """Yield ``get_h5_data(h5, i)`` for every song in the given HDF5 file.

    The module-level ``lock`` is held from the first ``next()`` until the
    generator finishes or is closed, because the ``yield`` sits inside the
    ``with lock`` block.

    Args:
        h5file: path to the HDF5 song file.
    """
    with lock:
        h5 = hdf5_getters.open_h5_file_read(h5file)
        try:
            num_songs = hdf5_getters.get_num_songs(h5)
            for i in range(num_songs):
                yield get_h5_data(h5, i)
        finally:
            # Also runs when the consumer stops early (generator close) or a
            # read raises, so the file handle is never left open.
            h5.close()
Example #8
0
def add_to_database(con, full_path):
    """Insert the artists and songs of one HDF5 file through ``con``.

    For each song the artist id is looked up by MusicBrainz id; when the
    lookup returns -1 the artist and its relations are inserted first.  The
    song row is then added with that artist id.

    Args:
        con: database connection handed to the table helpers.
        full_path: path to the HDF5 song file.
    """
    data = h5.open_h5_file_read(full_path)
    try:
        number_of_songs = h5.get_num_songs(data)
        for song_index in xrange(0, number_of_songs):
            artist_id = get_artist_id(con, h5.get_artist_mbid(data, song_index))
            if artist_id == -1:
                artist_id = add_data_to_artists_table(con, data, song_index)
                add_data_to_artists_rel_table(con, data, song_index, artist_id)
            # NOTE(review): song_index is not passed here, unlike the artist
            # helpers above; confirm against add_data_to_songs_table.
            add_data_to_songs_table(con, data, artist_id)
    finally:
        # Close the file even when a database helper raises.
        data.close()
def h5_to_df(hdf5_file):
    """Build a DataFrame with one row per song of an open HDF5 file.

    Columns are the keys of the module-level ``song_funs`` mapping; each cell
    is the result of calling the mapped function with the file and the song
    index.
    """
    column_names = song_funs.keys()

    table = [
        [song_funs[name](hdf5_file, song_idx) for name in column_names]
        for song_idx in xrange(hdf5_getters.get_num_songs(hdf5_file))
    ]

    return pd.DataFrame(table, columns=column_names)
def create_idix(h5, msd_path):
    '''
        Creates indices for the million songs.
        The reason is because songs are accessed as indices from 0
        to (the maximum number of songs - 1) and not SongIDs.

        Builds a dict mapping each song id in h5 to its index and pickles it
        to "songidix.txt" in the current working directory.
        msd_path is accepted but not used by this function.
    '''
    import hdf5_getters
    totsng = hdf5_getters.get_num_songs(h5)
    idixdic = dict()
    for count in range(0, totsng):
        idixdic[hdf5_getters.get_song_id(h5, count)] = count
    # "with" guarantees the file is closed even if pickling fails.
    with open("songidix.txt", "wb") as idixfile:
        pickle.dump(idixdic, idixfile)
def create_idix(h5, msd_path):
    '''
        Creates indices for the million songs.
        The reason is because songs are accessed as indices from 0
        to (the maximum number of songs - 1) and not SongIDs.

        Builds a dict mapping each song id in h5 to its index and pickles it
        to "songidix.txt" in the current working directory.
        msd_path is accepted but not used by this function.
    '''
    import hdf5_getters
    totsng = hdf5_getters.get_num_songs(h5)
    idixdic = dict()
    for count in range(0, totsng):
        idixdic[hdf5_getters.get_song_id(h5, count)] = count
    # "with" guarantees the file is closed even if pickling fails.
    with open("songidix.txt", "wb") as idixfile:
        pickle.dump(idixdic, idixfile)
Example #12
0
def traverseAndWrite(root, genreKeys, counts):
    """Recursively count, per genre, the songs whose artist tags match it.

    If ``root`` is not a regular file, every entry below it is visited
    recursively.  Otherwise ``root`` is opened as an HDF5 song file and
    ``counts[genre]`` is incremented for every (song, genre) pair accepted by
    ``genreInTags``.  The running counts are printed after each file.

    Args:
        root: path of a directory or of an HDF5 song file.
        genreKeys: iterable of genres to test.
        counts: mutable mapping from genre to count, updated in place.
    """
    if not isfile(root):
        for f in listdir(root):
            traverseAndWrite(root + "/" + f, genreKeys, counts)
        return

    h5 = hdf5_getters.open_h5_file_read(root)
    try:
        numOfSongs = hdf5_getters.get_num_songs(h5)
        for index in range(numOfSongs):
            tags = hdf5_getters.get_artist_mbtags(h5, index)
            for genre in genreKeys:
                if genreInTags(genre, tags):
                    counts[genre] += 1
        print(counts)
    finally:
        # Close the file even when reading tags fails.
        h5.close()
Example #13
0
def main():
    """Print the dictionary built for every song in the dataset tree.

    Descends exactly three directory levels below the hard-coded dataset
    path, opens every entry of each third-level directory as an HDF5 song
    file, prints its path and then ``build_h5_dictionary`` for each song.
    """
    path = '/Users/maxenchung/Desktop/untitled/MillionSongSubset/data/'
    directories1 = os.listdir(path)
    for directory1 in directories1:  #A
        directories2 = os.listdir(path + directory1)
        for directory2 in directories2:  #A/A
            directories3 = os.listdir(path + directory1 + '/' + directory2)
            for directory3 in directories3:  #A/A/A
                file_path = path + directory1 + '/' + directory2 + '/' + directory3 + '/'
                files = os.listdir(file_path)
                for filename in files:
                    h5_file = h5g.open_h5_file_read(file_path + filename)
                    try:
                        num_songs = h5g.get_num_songs(h5_file)
                        print(file_path + filename)
                        for song_index in range(0, num_songs):
                            print(build_h5_dictionary(h5_file, song_index))
                    finally:
                        # The original never closed the handle, so every
                        # visited file stayed open until the process ended.
                        h5_file.close()
Example #14
0
def get_csv_rows(h5file,fieldType='all'):
	"""Return all songs of an HDF5 file as double-quoted CSV text.

	For each song the fields returned by ``get_csv_h5_data`` are joined with
	``","``, stripped of newline and carriage-return characters, wrapped in
	double quotes and terminated with a newline.

	Args:
		h5file: path to the HDF5 song file.
		fieldType: forwarded unchanged to ``get_csv_h5_data``.

	Returns:
		One string containing a line per song ('' when the file has none).
	"""
	h5 = hdf5_getters.open_h5_file_read(h5file)

	output = []
	try:
		for i in range(hdf5_getters.get_num_songs(h5)):
			data = get_csv_h5_data(h5,i,fieldType)
			out = "\",\"".join(data)
			# Line breaks inside a field would split the CSV record.
			out = out.replace("\n","")
			out = out.replace("\r","")
			out = "\"" + out + "\""
			output.append(out)
			output.append("\n")
	finally:
		# Close the file even when a field cannot be read or joined.
		h5.close()

	return ''.join(output)
def get_song_info(song_path, pickle_path):
    # Collects string versions of the scalar getter results for song 0 of
    # `song_path` into `data`, keyed by getter name without its first four
    # characters.
    # NOTE(review): `pickle_path` is never used and `data` is neither dumped
    # nor returned in the visible code; the function looks unfinished.

    #Create a dictionary with fields and dump in pickle
    data = {}
    data['pickle_id'] = get_song_id(song_path)
    #print data['pickle_id']

    # get params
    hdf5path = song_path
    songidx = 0
    onegetter = ''

    # if len(sys.argv) > 2:
    #     songidx = int(sys.argv[2])
    # if len(sys.argv) > 3:
    #     onegetter = sys.argv[3]

    # sanity check
    if not os.path.isfile(hdf5path):
        print 'ERROR: file', hdf5path, 'does not exist.'
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(hdf5path)
    numSongs = hdf5_getters.get_num_songs(h5)
    if songidx >= numSongs:
        print 'ERROR: file contains only', numSongs
        h5.close()
        sys.exit(0)

    # get all getters
    getters = get_modified_getters()
    #print getters

    # print them
    for getter in getters:
        try:
            res = hdf5_getters.__getattribute__(getter)(h5, songidx)
        except AttributeError, e:
            # `summary` is not defined in this function; presumably a
            # module-level flag -- TODO confirm.
            if summary:
                continue
            else:
                print e
                print 'forgot -summary flag? specified wrong getter?'
                # NOTE(review): execution falls through to the check below
                # with `res` unset or left over from the previous getter.
        if res.__class__.__name__ == 'ndarray':
            # Array results are only reported by shape, not stored.
            print getter[4:] + ": shape =", res.shape
        else:
            data[getter[4:]] = str(res)
    # NOTE(review): h5 is not closed on this path in the visible code.
Example #16
0
 def _handle_h5_file(self, filename):
     """Write one row per song of ``filename`` through ``self.writer``.

     On the first call (``self.getters`` empty) the getter list is obtained
     from ``self._get_getters`` and a header row is written with each getter
     name minus its first four characters.  ndarray values are serialised
     with ``json.dumps(value.tolist())``; other values are written as
     returned.
     """
     h5 = hdf5_getters.open_h5_file_read(filename)
     try:
         num_songs = hdf5_getters.get_num_songs(h5)
         if not self.getters:
             self.getters = self._get_getters(h5)
             getter_row = [getter[4:] for getter in self.getters]
             self.writer.writerow(getter_row)
         # Resolve the getter functions once instead of once per song.
         getter_funcs = [getattr(hdf5_getters, getter) for getter in self.getters]
         for i in range(num_songs):
             result = []
             for hdf5_getter in getter_funcs:
                 value = hdf5_getter(h5, i)
                 if value.__class__.__name__ == 'ndarray':
                     # Special case for ndarray types
                     value = json.dumps(value.tolist())
                 result.append(value)
             self.writer.writerow(result)
     finally:
         # Close the file even when a getter or the writer raises.
         h5.close()
 def _handle_h5_file(self, filename):
     """Write one row per song of ``filename`` through ``self.writer``.

     On the first call (``self.getters`` empty) the getter list is obtained
     from ``self._get_getters`` and a header row is written with each getter
     name minus its first four characters.  ndarray values are serialised
     with ``json.dumps(value.tolist())``; other values are written as
     returned.
     """
     h5 = hdf5_getters.open_h5_file_read(filename)
     try:
         num_songs = hdf5_getters.get_num_songs(h5)
         if not self.getters:
             self.getters = self._get_getters(h5)
             getter_row = [getter[4:] for getter in self.getters]
             self.writer.writerow(getter_row)
         # Resolve the getter functions once instead of once per song.
         getter_funcs = [getattr(hdf5_getters, getter) for getter in self.getters]
         for i in xrange(num_songs):
             result = []
             for hdf5_getter in getter_funcs:
                 value = hdf5_getter(h5, i)
                 if value.__class__.__name__ == 'ndarray':
                     # Special case for ndarray types
                     value = json.dumps(value.tolist())
                 result.append(value)
             self.writer.writerow(result)
     finally:
         # Close the file even when a getter or the writer raises.
         h5.close()
Example #18
0
def _format_msd_rows(rows):
    """Render song dicts as tab-separated lines, one newline-terminated line each."""
    return "".join(
        "%s\t%s\t%s\t%s\t%s\t%s\n" % (
            s["msid"],
            s["artist_name"],
            s["energy"],
            s["loudness"],
            s["tempo"],
            s["year"])
        for s in rows)


def parse_file(h5):
    """Append songs 990000..end of ``h5`` to out/msd_orig.txt as TSV lines.

    Each line holds track id, artist name, energy, loudness, tempo and year.
    Rows are buffered and flushed whenever the song index is a multiple of
    10000 (past the start index), and once more at the end.  A warning is
    printed for artist names that contain a tab, since that would break the
    column layout.

    Args:
        h5: an already opened HDF5 song file; it is not closed here.
    """
    data = []
    # "with" closes the output file even when a getter raises mid-way.
    with open("out/msd_orig.txt", "a") as out:
        num_songs = hdf5_getters.get_num_songs(h5)
        for i in range(990000, num_songs):
            item = {}
            item["msid"] = H5.get_track_id(h5, i)
            item["artist_name"] = H5.get_artist_name(h5, i)
            item["energy"] = H5.get_energy(h5, i)
            item["loudness"] = H5.get_loudness(h5, i)
            item["tempo"] = H5.get_tempo(h5, i)
            item["year"] = H5.get_year(h5, i)
            data.append(item)

            if "\t" in item["artist_name"]:
                print("!Warning! Tab found in artist name: %s" % (item["artist_name"],))

            if i > 990000 and i % 10000 == 0:
                print("Writing %d to %d." % (i - 10000, i))
                out.write(_format_msd_rows(data))
                data = []

        # Flush whatever is left after the last full batch.
        out.write(_format_msd_rows(data))
def get_attribute(f):
    """Read a fixed set of numeric fields from one HDF5 song file.

    Prints the path, opens the file, calls the getters listed below (without
    a song index), passes the values through ``np.nan_to_num`` and appends
    the result to the module-level ``array``.

    Args:
        f: path to the HDF5 song file.
    """
    # The original began with ``count += 1`` on a name that is neither local
    # nor declared global, which raised UnboundLocalError on every call; the
    # value was never read, so the statement is dropped.
    print(f)
    # Order matters: it defines the position of each value in the row.
    getters = (
        hdf5_getters.get_num_songs,
        hdf5_getters.get_artist_familiarity,
        hdf5_getters.get_artist_hotttnesss,
        hdf5_getters.get_danceability,
        hdf5_getters.get_energy,
        hdf5_getters.get_key,
        hdf5_getters.get_key_confidence,
        hdf5_getters.get_loudness,
        hdf5_getters.get_mode,
        hdf5_getters.get_mode_confidence,
        hdf5_getters.get_tempo,
        hdf5_getters.get_time_signature,
        hdf5_getters.get_time_signature_confidence,
    )
    h5 = hdf5_getters.open_h5_file_read(f)
    try:
        temp = [getter(h5) for getter in getters]
    finally:
        # Release the handle even when one of the getters raises.
        h5.close()
    array.append(np.nan_to_num(temp))
Example #20
0
def main():
    """Export the cleaned song dictionaries of the dataset tree to output.csv.

    Descends exactly three directory levels below the hard-coded dataset
    path, builds a dictionary for every song with ``build_h5_dictionary`` and
    keeps the non-empty results of ``clean_h5_dictionary``.  The CSV header is
    the sorted key list of the last dictionary built; the kept rows follow.
    When no song is found at all, an empty output.csv is written.
    """
    path = '/Users/maxenchung/Desktop/untitled/MillionSongSubset/data/'
    directories1 = os.listdir(path)
    writeList = list()
    # Stays None when the tree holds no song; the original then failed with
    # an unbound ``my_dict`` while building the header.
    my_dict = None
    for directory1 in directories1:  #A
        directories2 = os.listdir(path + directory1)
        for directory2 in directories2:  #A/A
            directories3 = os.listdir(path + directory1 + '/' + directory2)
            for directory3 in directories3:  #A/A/A
                file_path = path + directory1 + '/' + directory2 + '/' + directory3 + '/'
                files = os.listdir(file_path)
                for filename in files:
                    with h5g.open_h5_file_read(file_path +
                                               filename) as h5_file:
                        num_songs = h5g.get_num_songs(h5_file)
                        print(file_path + filename)
                        for song_index in range(0, num_songs):
                            my_dict = build_h5_dictionary(h5_file, song_index)
                            newList = clean_h5_dictionary(my_dict)
                            # Only non-empty cleaned rows go into the csv file
                            if newList:
                                writeList.append(newList)

    print(writeList)
    # Header and rows are written through a single handle; the resulting file
    # content matches the former write-then-append sequence.
    with open('output.csv', 'w') as csvfile:
        mywriter = csv.writer(csvfile)
        if my_dict is not None:
            # Sorted keys of the last dictionary built form the header row
            mywriter.writerow(sorted(my_dict.keys()))
        mywriter.writerows(writeList)
sys.path.append(os.path.join(msd_code_path,'PythonSrc'))

# imports specific to the MSD
import hdf5_getters as GETTERS


# Counters initialised here; they are not updated in the visible part of the
# script.
cnt = 0
loops = 0


# Visit the per-letter top-level directories (A..Z) of the dataset and read
# every field of every song in each .h5 file found below them.
for alpha in string.ascii_uppercase :
   for root, dirs, files in os.walk('/mnt/million-songs/data/'+alpha):
      # Replace os.walk's file list with the .h5 files of this directory.
      files = glob.glob(os.path.join(root,'*'+'.h5'))
      for f in files :
         h5 = GETTERS.open_h5_file_read(f)
         num_songs = GETTERS.get_num_songs(h5)
         print f, num_songs

         # NOTE(review): no h5.close() appears in the visible part of this
         # loop; confirm the handle is released further down.
         for i in range(num_songs):
            analysis_sample_rate = GETTERS.get_analysis_sample_rate(h5, i)
            artist_7digitalid = GETTERS.get_artist_7digitalid(h5, i)
            artist_familiarity = GETTERS.get_artist_familiarity(h5, i)
            artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5, i)
            artist_id = GETTERS.get_artist_id(h5, i)
            artist_latitude = GETTERS.get_artist_latitude(h5, i)
            artist_location = GETTERS.get_artist_location(h5, i)
            artist_longitude = GETTERS.get_artist_longitude(h5, i)
            artist_mbid = GETTERS.get_artist_mbid(h5, i)
            # Array-valued fields are flattened to comma-separated strings.
            artist_mbtags = ','.join(str(e) for e in GETTERS.get_artist_mbtags(h5, i)) # array
            artist_mbtags_count = ','.join(str(e) for e in GETTERS.get_artist_mbtags_count(h5, i)) # array
            artist_name = GETTERS.get_artist_name(h5, i)
Example #22
0
    # get params
    hdf5path = sys.argv[1]
    songidx = 0
    if len(sys.argv) > 2:
        songidx = int(sys.argv[2])
    onegetter = ''
    if len(sys.argv) > 3:
        onegetter = sys.argv[3]

    # sanity check
    if not os.path.isfile(hdf5path):
        print 'ERROR: file',hdf5path,'does not exist.'
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(hdf5path)
    numSongs = hdf5_getters.get_num_songs(h5)
    if songidx >= numSongs:
        print 'ERROR: file contains only',numSongs
        h5.close()
        sys.exit(0)

    # get all getters
    getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
    getters.remove("get_num_songs") # special case
    if onegetter == 'num_songs' or onegetter == 'get_num_songs':
        getters = []
    elif onegetter != '':
        if onegetter[:4] != 'get_':
            onegetter = 'get_' + onegetter
        try:
            getters.index(onegetter)
Example #23
0
def transfer(h5path, matpath=None, force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there is more than one song in the HDF5 file, each
    field name gets a number appended: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path
                  with a different extension by default
        force   - if True and matfile exists, overwrite
    RETURN
        True if the file was transferred, False if the input path
        is missing or not a .h5 file, or if the matfile already
        exists and force is False.
        A MemoryError raised while reading is re-raised after a
        hint has been printed; other errors propagate unchanged.
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print('path to HF5 files does not exist: %s' % (h5path,))
        return False
    if not os.path.splitext(h5path)[1] == '.h5':
        print('expecting a .h5 extension for file: %s' % (h5path,))
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if not force:
            # existing matfile is left untouched (delete it or pass force)
            return False
        print('overwriting file: %s' % (matpath,))
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form.
    # A list is built explicitly because .remove() needs one; filter() only
    # returns a list on Python 2.
    getters = [name for name in hdf5_getters.__dict__.keys()
               if name[:4] == 'get_']
    getters.remove("get_num_songs")  # special case
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    matdata = {
        'transfer_note':
        'transferred on ' + time.ctime() + ' from file: ' + h5path
    }
    try:
        # read the song count inside the try so the file is closed even if
        # this first read fails
        nSongs = hdf5_getters.get_num_songs(h5)
        # iterate over songs
        for songidx in range(nSongs):
            # iterate over getter
            for getter in getters:
                gettername = getter[4:]
                if nSongs > 1:
                    # 1-based suffix keeps the fields of each song apart
                    gettername += str(songidx + 1)
                matdata[gettername] = getattr(hdf5_getters, getter)(h5, songidx)
    except MemoryError:
        print('Memory Error with file: %s' % (h5path,))
        print('All data has to be loaded in memory before being saved as matfile')
        print('Is this an aggregated / summary file with tons of songs?')
        print('This code is optimized for files containing one song,')
        print('but write me an email! (TBM)')
        raise
    finally:
        # close h5
        h5.close()
    # create
    sio.savemat(matpath, matdata)
    # all good
    return True
Example #24
0
              'track_7digitalid', 'year')
sep = '\t'
# Header line: the attribute names, tab-separated.
metadata.write(sep.join(list_attr1) + '\n')

getters1 = get_getters(list_attr1)

# Run through each .h5 file contained in the folder (skipping the '._*'
# files) and write one line per song.
progression = 0  # total number of songs written
interval = 0     # songs written since the last progress print
for folder, subfolders, files in os.walk(data_folder):
    for f in files:
        if f.endswith('.h5') and not f.startswith('._'):
            #open the hdf5 file
            h5 = hdf5_getters.open_h5_file_read(folder + '/' + f)
            try:
                #add one entry line in the database per row contained in the file
                for song_nb in range(hdf5_getters.get_num_songs(h5)):
                    write_line(metadata, getters1, h5, song_nb)
                    progression += 1
                    interval += 1
                    if interval == 1000:
                        print(progression)
                        interval = 0
            finally:
                # close the file even when a line cannot be written
                h5.close()

metadata.close()

print('nb songs = %d' % progression)
elapsed_time = time.time() - start_time
print('elapsed time = ' + str(elapsed_time) + ' sec')
Example #25
0
import csv
import math
import hdf5_getters
import operator

# Load the summary file and collect selected fields of every song that has a
# non-zero year.
print("loading...")
h5 = hdf5_getters.open_h5_file_read("files/msd_summary_file.h5")
data = []

length = hdf5_getters.get_num_songs(h5)

print("number of songs = ",length)

# Number of songs that passed the year filter.
count = 0;
for i in range(0,length):
	tmp = [];
	# Songs whose year getter returns 0 are skipped.
	if hdf5_getters.get_year(h5,songidx=i) == 0 :
		continue;
	#if math.isnan(hdf5_getters.get_artist_latitude(h5,songidx=i)) and hdf5_getters.get_artist_location(h5,songidx=i) =='':
	#	continue;
	count+=1;
	# NOTE(review): the str(...).replace("b'","").replace("'","") chain below
	# removes every apostrophe from the text as well; presumably it is meant
	# to strip the b'...' repr of byte strings -- confirm, and consider
	# decoding instead.
	tmp.append(str(hdf5_getters.get_track_id(h5,songidx=i)).replace("b'","").replace("'",""));	
	tmp.append(hdf5_getters.get_year(h5,songidx=i)); #0
	tmp.append(hdf5_getters.get_song_hotttnesss(h5,songidx=i)); #1
	tmp.append(str(hdf5_getters.get_title(h5,songidx=i)).replace("b'","").replace("'",""));	#2
	tmp.append(str(hdf5_getters.get_artist_id(h5,songidx=i)).replace("b'","").replace("'","")); #3	
	tmp.append(hdf5_getters.get_artist_latitude(h5,songidx=i)); #4
	tmp.append(hdf5_getters.get_artist_longitude(h5,songidx=i)); #5
	tmp.append(str(hdf5_getters.get_artist_location(h5,songidx=i)).replace("b'","").replace("'","")); #6
	tmp.append(str(hdf5_getters.get_artist_name(h5,songidx=i)).replace("b'","").replace("'","")); #7
	tmp.append(str(hdf5_getters.get_song_id(h5,songidx=i)).replace("b'","").replace("'",""));	
	#print "Calc: ", song.hotness, " + ", song.chart_score
	retval = 0.3*float(song.hotness) + 0.7*float(song.chart_score)
	return retval

class Song:
	"""Plain record of per-song fields, all given as class-level defaults."""

	# Text fields default to the empty string.
	name, artist = "", ""

	# Integer fields default to zero.  The original comment on `year` read
	# "Quantity in given year" -- meaning unclear, TODO confirm.
	year = hotness = chart_score = 0

	# Float score, zero by default.
	pop_score = 0.0


# Open the subset summary file at import time and read how many songs it
# holds.
h5 = h5get.open_h5_file_read("subset_msd_summary_file.h5")
numSongs = h5get.get_num_songs(h5)

#all_chart_info = {}

"""
with open('tsort-chart-2-2-0007.csv', 'rb') as csvfile:
	reader = csv.reader(csvfile)
	reader.next();
	for row in reader:
		#print row
		artist = row[0]
		title = row[1]
		score = row[4]

		all_chart_info[(artist,title)] = score;
"""
Example #27
0
    # get params
    hdf5path = sys.argv[1]
    songidx = 0
    if len(sys.argv) > 2:
        songidx = int(sys.argv[2])
    onegetter = ''
    if len(sys.argv) > 3:
        onegetter = sys.argv[3]

    # sanity check
    if not os.path.isfile(hdf5path):
        print 'ERROR: file', hdf5path, 'does not exist.'
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(hdf5path)
    numSongs = hdf5_getters.get_num_songs(h5)
    if songidx >= numSongs:
        print 'ERROR: file contains only', numSongs
        h5.close()
        sys.exit(0)

    # get all getters
    getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
    getters.remove("get_num_songs")  # special case
    if onegetter == 'num_songs' or onegetter == 'get_num_songs':
        getters = []
    elif onegetter != '':
        if onegetter[:4] != 'get_':
            onegetter = 'get_' + onegetter
        try:
            getters.index(onegetter)
def _read_song_row(h5, i):
    """Collect the exported fields of song ``i`` into a dict (text fields decoded as UTF-8)."""
    # Fields such as artist location/latitude/longitude, artist terms and the
    # segment/section/beat/bar/tatum arrays are deliberately not exported.
    return {
        'artist_name': hdf5_getters.get_artist_name(h5, i).decode('UTF-8'),
        'artist_familiarity': hdf5_getters.get_artist_familiarity(h5, i),
        'artist_hotness': hdf5_getters.get_artist_hotttnesss(h5, i),
        'artist_id': hdf5_getters.get_artist_id(h5, i).decode('UTF-8'),
        'artist_playmeid': hdf5_getters.get_artist_playmeid(h5, i),
        'artist_7digitalid': hdf5_getters.get_artist_7digitalid(h5, i),
        'release': hdf5_getters.get_release(h5, i).decode('UTF-8'),
        'song_hotttnesss': hdf5_getters.get_song_hotttnesss(h5, i),
        'title': hdf5_getters.get_title(h5, i).decode('UTF-8'),
        'danceability': hdf5_getters.get_danceability(h5, i),
        'duration': hdf5_getters.get_duration(h5, i),
        'end_of_fade_in': hdf5_getters.get_end_of_fade_in(h5, i),
        'energy': hdf5_getters.get_energy(h5, i),
        'key': hdf5_getters.get_key(h5, i),
        'key_confidence': hdf5_getters.get_key_confidence(h5, i),
        'loudness': hdf5_getters.get_loudness(h5, i),
        'mode': hdf5_getters.get_mode(h5, i),
        'tempo': hdf5_getters.get_tempo(h5, i),
        'time_signature': hdf5_getters.get_time_signature(h5, i),
        'track_id': hdf5_getters.get_track_id(h5, i).decode('UTF-8'),
        'year': hdf5_getters.get_year(h5, i),
    }


def get_all_rows(basedir, ext='.h5'):
    """Return one dict per song for every matching file below ``basedir``.

    Walks ``basedir`` recursively, opens each file whose name ends with
    ``ext`` and reads the fields listed in ``_read_song_row`` for every song
    in it.  The song index is printed as each song is read.

    Args:
        basedir: directory to walk.
        ext: file-name suffix of the HDF5 files to read.

    Returns:
        A list of dicts in traversal order (empty when nothing matches).
    """
    rows = []
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                for i in range(hdf5_getters.get_num_songs(h5)):
                    print(i)
                    rows.append(_read_song_row(h5, i))
            finally:
                # Close every file.  The original closed only the last file
                # of each directory, and raised when the first directory
                # visited contained no matching file.
                h5.close()
    return rows
Example #29
0
# Ubuntu: you can change the environment variable PYTHONPATH
# in your .bashrc file so you do not have to type these lines
sys.path.append(os.path.join(msd_code_path, 'PythonSrc'))

# imports specific to the MSD
import hdf5_getters as GETTERS

cnt = 0
loops = 0

for alpha in string.ascii_uppercase:
    for root, dirs, files in os.walk('/mnt/million-songs/data/' + alpha):
        files = glob.glob(os.path.join(root, '*' + '.h5'))
        for f in files:
            h5 = GETTERS.open_h5_file_read(f)
            num_songs = GETTERS.get_num_songs(h5)
            print f, num_songs

            for i in range(num_songs):
                analysis_sample_rate = GETTERS.get_analysis_sample_rate(h5, i)
                artist_7digitalid = GETTERS.get_artist_7digitalid(h5, i)
                artist_familiarity = GETTERS.get_artist_familiarity(h5, i)
                artist_hotttnesss = GETTERS.get_artist_hotttnesss(h5, i)
                artist_id = GETTERS.get_artist_id(h5, i)
                artist_latitude = GETTERS.get_artist_latitude(h5, i)
                artist_location = GETTERS.get_artist_location(h5, i)
                artist_longitude = GETTERS.get_artist_longitude(h5, i)
                artist_mbid = GETTERS.get_artist_mbid(h5, i)
                artist_mbtags = ','.join(
                    str(e) for e in GETTERS.get_artist_mbtags(h5, i))  # array
                artist_mbtags_count = ','.join(
Example #30
0
def parse_aggregate_songs(file_name,file_name2,artist_map):
    """
    Fold the songs of two aggregate HDF5 song files into ``artist_map``.

    ``artist_map`` has the format {artist_name: {data pertaining to artist}}
    and is updated in place; nothing is returned.

    For every song that passes the filters (artist latitude and longitude
    are not NaN, year is not 0, artist familiarity and hotttnesss are > 0):

    - if the artist is not in ``artist_map`` yet, a new entry is created
      holding the artist-level values plus one list per song-level field;
    - the song-level values are appended to the artist's lists.

    Songs of ``file_name2`` whose track id is already recorded for the
    artist are skipped, so a track present in both files is counted once.

    PARAM
        file_name  - path to the first aggregate HDF5 file
        file_name2 - path to the second aggregate HDF5 file
        artist_map - dict to update, may already contain artists
    """
    #TODO:MAYBE filter on dance and energy
    #TODO:see if should weight by freq or weight for if the term matches one of the feature terms
    _merge_aggregate_file(file_name, artist_map, 'Parsing song file...', False)
    _merge_aggregate_file(file_name2, artist_map, 'Parsing song file2...', True)


def _read_song_fields(h5, i, yr):
    """Return the song-level values of song ``i`` as ordered (key, value) pairs."""
    dance = hdf5_getters.get_danceability(h5,i)
    energy = hdf5_getters.get_energy(h5,i)
    s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
    return [
        # 0.0 danceability / energy and NaN song hotttnesss are stored as None
        ('danceability', None if dance == 0.0 else dance),
        ('duration', hdf5_getters.get_duration(h5,i)),
        ('end_of_fade_in', hdf5_getters.get_end_of_fade_in(h5,i)),
        ('energy', None if energy == 0.0 else energy),
        ('key', hdf5_getters.get_key(h5,i)),
        ('loudness', hdf5_getters.get_loudness(h5,i)),
        ('mode', hdf5_getters.get_mode(h5,i)), #major or minor 0/1
        ('song_hotttnesss', None if math.isnan(s_hot) else s_hot),
        ('start_of_fade_out', hdf5_getters.get_start_of_fade_out(h5,i)),
        ('tempo', hdf5_getters.get_tempo(h5,i)),
        ('time_signature', hdf5_getters.get_time_signature(h5,i)),
        ('track_id', hdf5_getters.get_track_id(h5,i)),
        ('year', yr),
    ]


def _merge_aggregate_file(file_name, artist_map, banner, skip_known_tracks):
    """Add every song of one aggregate file that passes the filters to ``artist_map``."""
    h5 = hdf5_getters.open_h5_file_read(file_name)
    try:
        numSongs = hdf5_getters.get_num_songs(h5)
        print(banner)
        for i in range(numSongs):
            artist_name = hdf5_getters.get_artist_name(h5,i)

            if skip_known_tracks and artist_name in artist_map:
                song_id = hdf5_getters.get_track_id(h5,i)
                if song_id in artist_map[artist_name]['track_id']:
                    #track already recorded for this artist
                    continue

            #Filter location
            longi = hdf5_getters.get_artist_longitude(h5,i)
            lat = hdf5_getters.get_artist_latitude(h5,i)
            if math.isnan(lat) or math.isnan(longi):
                #skip if no location
                continue

            #filter year
            yr = hdf5_getters.get_year(h5,i)
            if yr == 0:
                #skip if no year
                continue

            #filter hotttness and familiarity
            familiarity = hdf5_getters.get_artist_familiarity(h5,i)
            hotttness = hdf5_getters.get_artist_hotttnesss(h5,i)
            if familiarity<=0.0 or hotttness<=0.0:
                #skip if no hotttness or familiarity computations
                continue

            #TODO: compute an average timbre per song (segments_timbre holds one
            #length-12 vector per segment). The value was previously read here
            #and never used, so the read is left out until it is needed.
            song_fields = _read_song_fields(h5, i, yr)

            if artist_name not in artist_map:
                #have not encountered the artist yet, so populate new map
                sub_map = {}
                sub_map['artist_familiarity'] = familiarity
                sub_map['artist_hotttnesss'] = hotttness
                sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i)
                sub_map['artist_latitude'] = lat
                sub_map['artist_longitude'] = longi
                sub_map['artist_location'] = hdf5_getters.get_artist_location(h5,i)
                sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i)
                sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i))
                sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i))
                #song-specific data: one list per field, one item per song
                for key, _ in song_fields:
                    sub_map[key] = []
                artist_map[artist_name] = sub_map

            artist_entry = artist_map[artist_name]
            for key, value in song_fields:
                artist_entry[key].append(value)
    finally:
        #always release the HDF5 handle, also when a read fails
        h5.close()
def hd5_single_random_file_parser():
    """Dump every field of one hard-coded song file and write a tiny CSV.

    Prints the names of all ``get*`` attributes of ``hdf5_getters``, then
    each field of the song with a label, and finally writes the title and
    artist id to ``Tester2.csv`` (``;`` delimited, header row first).

    The HDF5 file is closed even when a getter or the CSV write fails.
    """
    # Label printed for each field, paired with the hdf5_getters function
    # that reads it. The order is the order of the output.
    labelled_getters = (
        ("Num of songs -- ", "get_num_songs"),  # One song per file
        ("Title -- ", "get_title"),
        ("Artist familiarity -- ", "get_artist_familiarity"),
        ("Artist hotness -- ", "get_artist_hotttnesss"),
        ("Artist ID -- ", "get_artist_id"),
        ("Artist mbID -- ", "get_artist_mbid"),
        ("Artist playmeid -- ", "get_artist_playmeid"),
        ("Artist 7DigitalID -- ", "get_artist_7digitalid"),
        ("Artist latitude -- ", "get_artist_latitude"),
        ("Artist longitude -- ", "get_artist_longitude"),
        ("Artist location -- ", "get_artist_location"),
        ("Artist Name -- ", "get_artist_name"),
        ("Release -- ", "get_release"),
        ("Release 7DigitalID -- ", "get_release_7digitalid"),
        ("Song ID -- ", "get_song_id"),
        ("Song Hotness -- ", "get_song_hotttnesss"),
        ("Track 7Digital -- ", "get_track_7digitalid"),
        ("Similar artists -- ", "get_similar_artists"),
        ("Artist terms -- ", "get_artist_terms"),
        ("Artist terms freq -- ", "get_artist_terms_freq"),
        ("Artist terms weight -- ", "get_artist_terms_weight"),
        ("Analysis sample rate -- ", "get_analysis_sample_rate"),
        ("Audio md5 -- ", "get_audio_md5"),
        ("Danceability -- ", "get_danceability"),
        ("Duration -- ", "get_duration"),
        ("End of Fade -- ", "get_end_of_fade_in"),
        ("Energy -- ", "get_energy"),
        ("Key -- ", "get_key"),
        ("Key Confidence -- ", "get_key_confidence"),
        ("Loudness -- ", "get_loudness"),
        ("Mode -- ", "get_mode"),
        ("Mode Confidence -- ", "get_mode_confidence"),
        ("Start of fade out -- ", "get_start_of_fade_out"),
        ("Tempo -- ", "get_tempo"),
        ("Time signature -- ", "get_time_signature"),
        ("Time signature confidence -- ", "get_time_signature_confidence"),
        ("Track ID -- ", "get_track_id"),
        ("Segments Start -- ", "get_segments_start"),
        ("Segments Confidence -- ", "get_segments_confidence"),
        ("Segments Pitches -- ", "get_segments_pitches"),
        ("Segments Timbre -- ", "get_segments_timbre"),
        ("Segments Loudness max -- ", "get_segments_loudness_max"),
        ("Segments Loudness max time-- ", "get_segments_loudness_max_time"),
        ("Segments Loudness start -- ", "get_segments_loudness_start"),
        ("Sections start -- ", "get_sections_start"),
        ("Sections Confidence -- ", "get_sections_confidence"),
        ("Beats start -- ", "get_beats_start"),
        ("Beats confidence -- ", "get_beats_confidence"),
        ("Bars start -- ", "get_bars_start"),
        ("Bars confidence -- ", "get_bars_confidence"),
        ("Tatums start -- ", "get_tatums_start"),
        ("Tatums confidence -- ", "get_tatums_confidence"),
        ("Artist mbtags -- ", "get_artist_mbtags"),
        ("Artist mbtags count -- ", "get_artist_mbtags_count"),
        ("Year -- ", "get_year"),
    )

    # Open an h5 file in read mode
    h5 = hdf5_getters.open_h5_file_read(
        '/home/skalogerakis/Documents/MillionSong/MillionSongSubset/A/M/G/TRAMGDX12903CEF79F.h5'
    )
    try:
        # List every attribute of the module whose name starts with 'get',
        # i.e. the available getter functions.
        for name in hdf5_getters.__dict__.keys():
            if name.startswith('get'):
                print(name)

        # First effort to check what each field contains.
        print()
        for label, getter_name in labelled_getters:
            # Look the getter up lazily so fields are read in output order.
            print(label, getattr(hdf5_getters, getter_name)(h5))

        fields = ['Title', 'Artist ID']

        with open('Tester2.csv', 'w', newline='') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=';')

            # writing the fields
            csv_writer.writerow(fields)

            # writing the data rows
            csv_writer.writerow(
                [hdf5_getters.get_title(h5),
                 hdf5_getters.get_artist_id(h5)])
    finally:
        h5.close()  # close h5 when completed, also after an error
Example #32
0
import hdf5_getters

h5 = hdf5_getters.open_h5_file_read("data/msd_summary_file.h5")
try:
    # Print the song ID of every song stored in the summary file.
    for i in range(hdf5_getters.get_num_songs(h5)):
        print(hdf5_getters.get_song_id(h5, i))
finally:
    # Release the HDF5 handle even if a read fails part-way through.
    h5.close()
Example #33
0
# Lower-cased attribute name -> column title written in the CSV header.
_CSV_COLUMN_TITLES = dict((title.lower(), title) for title in (
    'AlbumID', 'AlbumName', 'TrackId', 'ArtistID', 'ArtistLatitude',
    'ArtistLocation', 'ArtistLongitude', 'ArtistName', 'Danceability',
    'Duration', 'KeySignature', 'KeySignatureConfidence', 'SongID', 'Tempo',
    'TimeSignature', 'TimeSignatureConfidence', 'Title', 'Year'))

# How each attribute is rendered in a row. Every table maps the lower-cased
# attribute name to the name of the Song attribute holding its text.
# Written as-is:
_PLAIN_FIELDS = {
    'albumid': 'albumID',
    'trackid': 'trackId',
    'danceability': 'danceability',
    'duration': 'duration',
    'keysignature': 'keySignature',
    'keysignatureconfidence': 'keySignatureConfidence',
    'tempo': 'tempo',
    'timesignature': 'timeSignature',
    'timesignatureconfidence': 'timeSignatureConfidence',
    'year': 'year',
}
# Wrapped in double quotes:
_QUOTED_FIELDS = {
    'artistid': 'artistID',
    'artistname': 'artistName',
    'songid': 'id',
    'title': 'title',
}
# Commas removed, then wrapped in double quotes:
_QUOTED_COMMA_FREE_FIELDS = {
    'albumname': 'albumName',
    'artistlocation': 'artistLocation',
}
# Written as-is, except that the text 'nan' becomes an empty field:
_BLANK_IF_NAN_FIELDS = {
    'artistlatitude': 'artistLatitude',
    'artistlongitude': 'artistLongitude',
}


def main(basedir="/home/umwangye/millonsong/MillionSongSubset/data/",
         ext=".h5"):
    """Write one CSV row per song found under ``basedir`` to ``SongCSV.csv``.

    The first line of the file is the header ``SongNumber,<columns...>``;
    every following line starts with the song's ``songCount`` followed by
    the selected columns.

    PARAM
        basedir - root directory searched recursively for song files
        ext     - extension of the song files, '.h5' (HDF5) by default
    """
    #################################################
    #if you want to prompt the user for the order of attributes in the csv,
    #set the prompt boolean to True
    #else, leave 'prompt' set to False and set the order of attributes in the
    #'else' clause
    prompt = False
    #################################################

    outputFile1 = open('SongCSV.csv', 'w')
    try:
        if prompt:
            csvAttributeList = _prompt_for_columns()
            header = ",".join(
                _CSV_COLUMN_TITLES[name] for name in csvAttributeList)
        else:
            #################################################
            #change the order of the csv file here
            #Default is to list all available attributes
            header = (
                "SongID,AlbumID,AlbumName,TrackId,ArtistID,ArtistLatitude,ArtistLocation,"
                "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,"
                "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"
                "Title,Year")
            #################################################
            csvAttributeList = [
                name.lower() for name in re.split(r'\W+', header)]

        # Rows always start with the song count, so the header always starts
        # with its column title, whichever way the columns were chosen.
        outputFile1.write("SongNumber," + header + "\n")

        for root, dirs, files in os.walk(basedir):
            for f in glob.glob(os.path.join(root, '*' + ext)):
                print(f)

                songH5File = hdf5_getters.open_h5_file_read(f)
                try:
                    numPerH5 = hdf5_getters.get_num_songs(songH5File)
                    for cnt in range(numPerH5):
                        song = _load_song(songH5File, cnt)
                        fields = [str(song.songCount)]
                        fields.extend(
                            _song_csv_field(song, attribute)
                            for attribute in csvAttributeList)
                        outputFile1.write(",".join(fields) + "\n")
                finally:
                    songH5File.close()
    finally:
        # Also reached on sys.exit() from the prompt and on read errors.
        outputFile1.close()


def _parse_requested_columns(text):
    """Return the lower-cased column names in ``text``, or None if invalid.

    Exits the program when the token 'exit' is reached before an unknown one.
    """
    names = [token.lower() for token in re.split(r'\W+', text) if token]
    if not names:
        return None
    for name in names:
        if name == 'exit':
            sys.exit()
        if name not in _CSV_COLUMN_TITLES:
            return None
    return names


def _prompt_for_columns():
    """Ask the user for the column order until a valid answer is given."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    while True:
        csvAttributeString = read_line(
            "\n\nIn what order would you like the colums of the CSV file?\n"
            "Please delineate with commas. The options are: "
            "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"
            " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo,"
            " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n"
            "For example, you may write \"Title, Tempo, Duration\"...\n\n"
            "...or exit by typing 'exit'.\n\n")
        columns = _parse_requested_columns(csvAttributeString)
        if columns is not None:
            return columns
        # Nothing is written to the CSV for a rejected answer.
        print("==============")
        print("I believe there has been an error with the input.")
        print("==============")


def _load_song(songH5File, cnt):
    """Build the Song for entry ``cnt`` of an open file, every field as text."""
    def read(getter):
        return str(getter(songH5File, cnt))

    song = Song(read(hdf5_getters.get_song_id))
    song.trackId = read(hdf5_getters.get_track_id)
    song.artistID = read(hdf5_getters.get_artist_id)
    song.albumID = read(hdf5_getters.get_release_7digitalid)
    song.albumName = read(hdf5_getters.get_release)
    song.artistLatitude = read(hdf5_getters.get_artist_latitude)
    song.artistLocation = read(hdf5_getters.get_artist_location)
    song.artistLongitude = read(hdf5_getters.get_artist_longitude)
    song.artistName = read(hdf5_getters.get_artist_name)
    song.danceability = read(hdf5_getters.get_danceability)
    song.duration = read(hdf5_getters.get_duration)
    song.keySignature = read(hdf5_getters.get_key)
    song.keySignatureConfidence = read(hdf5_getters.get_key_confidence)
    song.tempo = read(hdf5_getters.get_tempo)
    song.timeSignature = read(hdf5_getters.get_time_signature)
    song.timeSignatureConfidence = read(
        hdf5_getters.get_time_signature_confidence)
    song.title = read(hdf5_getters.get_title)
    song.year = read(hdf5_getters.get_year)
    return song


def _song_csv_field(song, attribute):
    """Return the CSV text of one lower-cased ``attribute`` of ``song``."""
    if attribute in _PLAIN_FIELDS:
        return getattr(song, _PLAIN_FIELDS[attribute])
    if attribute in _QUOTED_FIELDS:
        return "\"" + getattr(song, _QUOTED_FIELDS[attribute]) + "\""
    if attribute in _QUOTED_COMMA_FREE_FIELDS:
        text = getattr(song, _QUOTED_COMMA_FREE_FIELDS[attribute])
        return "\"" + text.replace(',', '') + "\""
    if attribute in _BLANK_IF_NAN_FIELDS:
        text = getattr(song, _BLANK_IF_NAN_FIELDS[attribute])
        return '' if text == 'nan' else text
    # Unknown column name: keep the marker text the CSV has always carried.
    return "Erm. This didn't work. Error. :( :(\n"
Example #34
0
import csv
import math
import hdf5_getters
import operator

# Load the summary file and collect one list of values per song that has a year.
print("loading...")
h5 = hdf5_getters.open_h5_file_read("files/msd_summary_file.h5")
data = []

length = hdf5_getters.get_num_songs(h5)

print("number of songs = ", length)

# count = number of songs kept so far (songs whose year is 0 are skipped)
count = 0
for i in range(0, length):
    tmp = []
    if hdf5_getters.get_year(h5, songidx=i) == 0:
        continue
    #if math.isnan(hdf5_getters.get_artist_latitude(h5,songidx=i)) and hdf5_getters.get_artist_location(h5,songidx=i) =='':
    #	continue;
    count += 1
    # str() of the track id is stripped of the "b'" prefix and the quotes
    tmp.append(
        str(hdf5_getters.get_track_id(h5,
                                      songidx=i)).replace("b'",
                                                          "").replace("'", ""))
    tmp.append(hdf5_getters.get_year(h5, songidx=i))
    #0
    tmp.append(hdf5_getters.get_song_hotttnesss(h5, songidx=i))
    #1
    tmp.append(
        str(hdf5_getters.get_title(h5,

getters1 = get_getters(list_attr1)


#run through each .h5 file contained in the folder
progression = 0  # total number of songs written so far
interval = 0     # songs written since the last progress print
try:
    for folder, subfolders, files in os.walk(data_folder):
        for f in files:
            # only .h5 files; names starting with '._' are skipped
            # (presumably hidden companion files, not real song files)
            if not f.endswith('.h5') or f.startswith('._'):
                continue
            #open the hdf5 file
            h5 = hdf5_getters.open_h5_file_read(os.path.join(folder, f))
            try:
                #add one entry line in the database per row contained in the file
                for song_nb in range(hdf5_getters.get_num_songs(h5)):
                    write_line(metadata,getters1,h5,song_nb)
                    progression += 1
                    interval += 1
                    if interval == 1000:
                        print(progression)
                        interval = 0
            finally:
                # release the handle even if writing a line fails
                h5.close()
finally:
    metadata.close()

print('nb songs = %d' %progression)
elapsed_time=time.time()-start_time
print('elapsed time = ' + str(elapsed_time) + ' sec')
        
        
def transfer(h5path,matpath=None,force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there is more than one song in the HDF5 file, each
    field name gets a number appended: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path
                  with a different extension by default
        force   - if True and matfile exists, overwrite
    RETURN
        True if the file was transferred, False if there was
        a problem (missing file, wrong extension, or matfile
        already there and force is False).
        Errors raised while reading or saving are not caught;
        a MemoryError is reported on stdout and re-raised.
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print('path to HF5 files does not exist: %s' % (h5path,))
        return False
    if os.path.splitext(h5path)[1] != '.h5':
        print('expecting a .h5 extension for file: %s' % (h5path,))
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if not force:
            print('matfile %s already exists (delete or force):' % (matpath,))
            return False
        print('overwriting file: %s' % (matpath,))
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form; get_num_songs is a special case since
    # it is not a per-song field. A list (not a filter object) so the code
    # runs the same on Python 2 and 3.
    getters = [name for name in vars(hdf5_getters)
               if name[:4] == 'get_' and name != 'get_num_songs']
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    try:
        nSongs = hdf5_getters.get_num_songs(h5)
        # iterate over songs
        for songidx in range(nSongs):
            # iterate over getter
            for getter in getters:
                # field name is the getter name without 'get_'
                gettername = getter[4:]
                if nSongs > 1:
                    # 1-based song number keeps the fields of each song apart
                    gettername += str(songidx+1)
                matdata[gettername] = getattr(hdf5_getters, getter)(h5,songidx)
    except MemoryError:
        print('Memory Error with file: %s' % (h5path,))
        print('All data has to be loaded in memory before being saved as matfile')
        print('Is this an aggregated / summary file with tons of songs?')
        print('This code is optimized for files containing one song,')
        print('but write me an email! (TBM)')
        raise
    finally:
        # close h5, whatever happened while reading
        h5.close()
    # create
    sio.savemat(matpath,matdata)
    # all good
    return True
Example #37
0
def get_all_examples(basedir, genre_dict, ext='.h5'):
    """
    From a base directory, goes through all subdirectories,
    and grabs all songs and their features and puts them into a pandas dataframe
    INPUT
       basedir    - base directory of the dataset
       genre_dict - a dictionary mapping track id to genre based tagraum dataset
       ext        - extension, .h5 by default
    RETURN
       dataframe containing all song examples: one row per song whose
       track id is a key of genre_dict. An empty, column-less dataframe
       is returned when nothing matches. The row index is not reset
       (every row keeps the index 0 of its single-row frame).
    """
    columns = ['artist_name', 'title', 'song_id', 'genre', 'year',
               'key', 'key_confidence', 'mode', 'mode_confidence',
               'time_signature', 'time_signature_confidence',
               'duration', 'end_of_fade_in', 'loudness',
               'song_hotttnesss', 'tempo']
    # Single-row frames are collected and concatenated once at the end;
    # DataFrame.append (removed in pandas 2.0) copied the whole frame on
    # every call.
    frames = []

    # iterate over all files in all subdirectories
    for root, _dirs, _files in os.walk(basedir):
        for path in glob.glob(os.path.join(root, '*' + ext)):
            h5 = GETTERS.open_h5_file_read(path)
            try:
                num_songs = GETTERS.get_num_songs(h5)
                for i in range(num_songs):
                    # progress indicator for large aggregate files
                    if i % 10000 == 0:
                        print(i)
                    song_id = GETTERS.get_track_id(h5, i).decode('utf-8')
                    if song_id not in genre_dict:
                        continue
                    # Every getter receives the song index i so that
                    # multi-song files yield the metadata of song i, not
                    # always that of song 0.
                    row = (
                        GETTERS.get_artist_name(h5, i),
                        GETTERS.get_title(h5, i),
                        song_id,
                        genre_dict[song_id],
                        GETTERS.get_year(h5, i),
                        GETTERS.get_key(h5, i),
                        GETTERS.get_key_confidence(h5, i),
                        GETTERS.get_mode(h5, i),
                        GETTERS.get_mode_confidence(h5, i),
                        GETTERS.get_time_signature(h5, i),
                        GETTERS.get_time_signature_confidence(h5, i),
                        GETTERS.get_duration(h5, i),
                        GETTERS.get_end_of_fade_in(h5, i),
                        GETTERS.get_loudness(h5, i),
                        GETTERS.get_song_hotttnesss(h5, i),
                        GETTERS.get_tempo(h5, i),
                    )
                    frames.append(pd.DataFrame(data=[row], columns=columns))
            finally:
                # release the file handle even if a getter raises
                h5.close()

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
Example #38
0
# Field names accepted in the field-list file. Each NAME is read from an
# h5 file through the accessor ``getters.get_NAME``.
_GETINFO_FIELDS = frozenset([
    'track_id', 'segments_pitches', 'time_signature_confidence',
    'song_hotttnesss', 'artist_longitude', 'tatums_confidence',
    'num_songs', 'duration', 'start_of_fade_out', 'artist_name',
    'similar_artists', 'artist_mbtags', 'artist_terms_freq', 'release',
    'song_id', 'track_7digitalid', 'title', 'artist_latitude', 'energy',
    'key', 'release_7digitalid', 'artist_mbid', 'segments_confidence',
    'artist_hotttnesss', 'time_signature', 'segments_loudness_max_time',
    'mode', 'segments_loudness_start', 'tempo', 'key_confidence',
    'analysis_sample_rate', 'bars_confidence', 'artist_playmeid',
    'artist_terms_weight', 'segments_start', 'artist_location',
    'loudness', 'year', 'artist_7digitalid', 'audio_md5',
    'segments_timbre', 'mode_confidence', 'end_of_fade_in',
    'danceability', 'artist_familiarity', 'artist_mbtags_count',
    'tatums_start', 'artist_id', 'segments_loudness_max', 'bars_start',
    'beats_start', 'artist_terms', 'sections_start', 'beats_confidence',
    'sections_confidence',
])


def _bytes_to_text(item):
    """Decode a bytes object, or a (nested) list of them, to text."""
    if isinstance(item, list):
        return '[' + ' '.join(_bytes_to_text(sub) for sub in item) + ']'
    return item.decode('utf-8', 'replace')


def _field_text(value):
    """Render one getter result as a single-line CSV cell."""
    if isinstance(value, bytes):
        text = _bytes_to_text(value)
    elif getattr(getattr(value, 'dtype', None), 'kind', None) == 'S':
        # presumably a numpy array of byte strings (e.g. artist terms);
        # decode its items instead of printing their b'...' repr
        text = _bytes_to_text(value.tolist())
    else:
        text = str(value)
    return text.replace('\n', '')


def getInfo(files):
    """
    Build CSV text with one row per h5 file.

    The whitespace-separated field names are read from the file named
    by sys.argv[1]. The first output line is the comma-joined field
    list; each following line holds the requested values of one file,
    in the same order.

    INPUT
       files - iterable of paths to h5 files
    RETURN
       the CSV text as a single string, with all single and double
       quote characters removed. Values are not escaped, so a value
       containing a comma will spill into extra columns.

    Prints 'error: unspecified field' and exits the process with
    status 0 when a requested field is unknown. The check runs while
    processing a file, so it never triggers for an empty `files`.
    """
    with open(sys.argv[1], 'r') as f:
        contents = f.read()
    fields = contents.split()
    print("creating csv with following fields:" + contents)

    lines = [','.join(fields)]
    for fil in files:
        curFile = getters.open_h5_file_read(fil)
        try:
            row = []
            for field in fields:
                if field not in _GETINFO_FIELDS:
                    print('error: unspecified field')
                    sys.exit(0)
                # only the requested getters are evaluated
                value = getattr(getters, 'get_' + field)(curFile)
                row.append(_field_text(value))
        finally:
            # also runs on the sys.exit path above
            curFile.close()
        lines.append(','.join(row))

    csv_text = '\n'.join(lines) + '\n'
    # Bytes values are decoded above, so there is no b'...' prefix to
    # strip; deleting every "b" would corrupt field names and data.
    return csv_text.replace("'", '').replace('"', '')