def ingest(self):
        """Read every track from the track generator and bulk-ingest it.

        For each ``(h5_fd, sng_idx)`` pair produced by
        ``self.track_generator.get_track()`` one document is built by calling
        every getter named in ``getters`` on ``hdf5_getters``. Each value is
        passed through ``Ingestor.convert_type`` and stored under the getter
        name with its ``get_`` prefix replaced by ``msd_``.

        Documents are buffered by track id and handed to
        ``self.es_helper.ingest_to_es`` each time ``es_bulk_size`` documents
        have accumulated, and once more after the loop so that the last,
        partially filled batch is not lost.
        """
        es_bulk_docs = {}

        for h5_fd, sng_idx in self.track_generator.get_track():
            msd_id = hdf5_getters.get_track_id(h5_fd, sng_idx)
            msd_doc = {}

            for getter in getters:
                field_name = getter.split("get_")[-1]
                msd_field_name = "msd_" + field_name  # prefixed for ES storage
                try:
                    raw_value = getattr(hdf5_getters, getter)(h5_fd, sng_idx)
                    # Type conversions
                    msd_doc[msd_field_name] = Ingestor.convert_type(raw_value)
                except AttributeError as e:
                    # Best effort: the failure is logged and the field is
                    # simply left out of the document.
                    logger.debug("ERROR. AttributeError. {}".format(e))

            es_bulk_docs[msd_id] = msd_doc

            # Ingest bulk if size is enough
            if len(es_bulk_docs) == es_bulk_size:
                logger.debug("{} files read. Bulk ingest.".format(sng_idx + 1))
                logger.debug("Last MSD id read: {}".format(msd_id))
                self.es_helper.ingest_to_es(es_bulk_docs)
                es_bulk_docs = {}

        # Flush whatever is left when the track count is not a multiple of
        # the bulk size; without this the final documents were dropped.
        if es_bulk_docs:
            logger.debug("Last MSD id read: {}".format(msd_id))
            self.es_helper.ingest_to_es(es_bulk_docs)
Beispiel #2
0
def main():
    """Export selected song properties of every HDF5 file to ``data.csv``.

    Expects exactly one command-line argument, the directory with the data
    files; otherwise a usage message is printed and the function returns.
    The first CSV row holds the property names. Files whose song hotttnesss
    is NaN are skipped. ``artist_terms`` is written as a ``;``-joined string,
    every other property as ``str(value)``. Exits the process with status 0
    once the file has been written.
    """
    if len(sys.argv) != 2:
        print('Takes one argument, the directory with the data files.')
        return

    hdf5_files = get_all_files(sys.argv[1])

    # Define properties to get
    properties = [
        'danceability', 'duration', 'end_of_fade_in', 'energy', 'key',
        'loudness', 'mode', 'song_hotttnesss', 'start_of_fade_out', 'tempo',
        'time_signature', 'year', 'artist_terms',
    ]

    count_datapoints = 0
    with open('data.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(properties)

        for abspath in hdf5_files:
            h5 = hdf5_getters.open_h5_file_read(abspath)
            try:
                # Skip songs without a usable hotttnesss value.
                if math.isnan(hdf5_getters.get_song_hotttnesss(h5)):
                    continue
                prop_values = []
                for prop in properties:
                    prop_value = getattr(hdf5_getters, 'get_' + prop)(h5)
                    # special case artist terms to format it
                    if prop == 'artist_terms':
                        prop_values.append(';'.join(str(x) for x in prop_value))
                    else:
                        prop_values.append(str(prop_value))
            finally:
                # Close the handle on every path, including getter failures.
                h5.close()
            csvwriter.writerow(prop_values)
            count_datapoints += 1
    # The original used a bare '%' with str.format, so the count never showed.
    print('Wrote {} datapoints (lines)'.format(count_datapoints))
    sys.exit(0)
Beispiel #3
0
def sanity_check_1thread(maindir=None, threadid=-1, nthreads=-1, allfiles=[]):
    """
    Main function, check a bunch of files by opening every field in
    getter.
    """
    assert not maindir is None, 'wrong param maindir'
    assert threadid > -1, 'wrong param threadid'
    assert nthreads > 0, 'wrong param nthreads'
    assert len(allfiles) > 0, 'wrong param allfiles, or no files'
    # get getters
    getters = filter(lambda x: x[:4] == 'get_', GETTERS.__dict__.keys())
    # get the files to check
    files_per_thread = int(np.ceil(len(allfiles) * 1. / nthreads))
    p1 = files_per_thread * threadid
    p2 = min(len(allfiles), files_per_thread * (threadid + 1))
    # iterate over files between p1 and p2
    for f in allfiles[p1:p2]:
        try:
            h5 = GETTERS.open_h5_file_read(f)
            for getter in getters:
                tmp = GETTERS.__getattribute__(getter)(h5)
        except KeyboardInterrupt:
            raise KeyboardInterruptError()
        except Exception, e:
            print 'PROBLEM WITH FILE:', f
            sys.stdout.flush()
            raise
        finally:
Beispiel #4
0
def sanity_check_1thread(maindir=None,threadid=-1,nthreads=-1,allfiles=[]):
    """
    Main function, check a bunch of files by opening every field in
    getter.
    """
    assert not maindir is None,'wrong param maindir'
    assert threadid>-1,'wrong param threadid'
    assert nthreads>0,'wrong param nthreads'
    assert len(allfiles)>0,'wrong param allfiles, or no files'
    # get getters
    getters = filter(lambda x: x[:4] == 'get_', GETTERS.__dict__.keys())
    # get the files to check
    files_per_thread = int(np.ceil(len(allfiles) * 1. / nthreads))
    p1 = files_per_thread * threadid
    p2 = min(len(allfiles),files_per_thread * (threadid+1))
    # iterate over files between p1 and p2
    for f in allfiles[p1:p2]:
        try:
            h5 = GETTERS.open_h5_file_read(f)
            for getter in getters:
                tmp = GETTERS.__getattribute__(getter)(h5)
        except KeyboardInterrupt:
            raise KeyboardInterruptError()
        except Exception,e:
            print 'PROBLEM WITH FILE:',f; sys.stdout.flush()
            raise
        finally:
Beispiel #5
0
def get(getters, h5file):
    """Collect the non-array fields of the first song in an HDF5 file.

    ``getters`` holds getter names WITHOUT the ``get_`` prefix; each one is
    looked up on ``hdf5_getters`` and called for song index 0. Values whose
    class is named ``ndarray`` are ignored (a notice is printed); every other
    value is stored in a dict under the unprefixed getter name.

    Exits the process with status 0 when ``h5file`` is not an existing file
    or the file contains no songs.
    """
    # sanity check
    if not os.path.isfile(h5file):
        print('ERROR: file {} does not exist.'.format(h5file))
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(h5file)
    numSongs = hdf5_getters.get_num_songs(h5)
    songidx = 0
    if songidx >= numSongs:
        print('ERROR: file contains only {}'.format(numSongs))
        h5.close()
        sys.exit(0)

    line = dict()
    for getter in getters:
        try:
            res = getattr(hdf5_getters, 'get_' + getter)(h5, songidx)
        except AttributeError as e:
            print(e)
            # Skip this getter: falling through would use an unbound `res`
            # (first iteration) or the previous getter's value.
            continue
        if res.__class__.__name__ == 'ndarray':
            # How to put multidimensional values into file.
            # Try to put only mean of the values etc...
            print('Ignoring....')
        else:
            line[getter] = res
    # NOTE(review): as shown, the function neither returns `line` nor closes
    # `h5` -- confirm whether the tail of the definition is missing.
Beispiel #6
0
def extractSongData(file_name, getters_to_apply):
    """Return the mean of each requested field of one song as a 1-D array.

    The song is read from ``./canciones/<file_name>.h5``. Every name in
    ``getters_to_apply`` is looked up on the ``getters`` module and called on
    the open file; ``np.mean`` of each result is appended, in order, to the
    returned array.
    """
    path = './canciones/' + file_name + '.h5'
    h5 = getters.open_h5_file_read(path)
    try:
        song = np.empty(0)
        for get in getters_to_apply:
            res = getattr(getters, get)(h5)
            song = np.append(song, np.mean(res))
    finally:
        # Release the file handle even when a getter raises.
        h5.close()
    return song
def extractValues(hdf5path, summary, fields):
    """Read the requested fields of the first song of an HDF5 file.

    PARAM
        hdf5path - path of the HDF5 song file to open
        summary  - when true, getters that raise AttributeError are skipped
                   silently; otherwise the error and a hint are printed
        fields   - getter names, with or without the 'get_' prefix
    NOTE
        Exits the process with status 0 if a requested getter does not exist
        ('get_num_songs' is not accepted). Values are collected in a dict
        keyed by the getter name without its prefix; NaN float64 values are
        replaced by Global_Constant and ndarrays are turned into lists.
    """
    songidx = 0
    h5 = hdf5_getters.open_h5_file_read(hdf5path)

    # get all getters
    keys = [name for name in vars(hdf5_getters) if name[:4] == 'get_']
    keys.remove("get_num_songs")  # special case

    getters = []
    for onegetter in fields:
        if onegetter[:4] != 'get_':
            onegetter = 'get_' + onegetter  # add get_
        # Validate and keep every requested field. The original only did this
        # inside the branch above, so names already starting with 'get_' were
        # silently dropped.
        if onegetter not in keys:
            print('ERROR: getter requested: {} does not exist.'.format(onegetter))
            h5.close()
            sys.exit(0)
        getters.append(onegetter)

    getters.sort()

    retDict = {}

    for getter in getters:
        try:
            res = getattr(hdf5_getters, getter)(h5, songidx)
        except AttributeError as e:
            if not summary:
                print(e)
                print('forgot -summary flag? specified wrong getter?')
            # Never fall through: `res` would be unbound or stale here.
            continue

        if res.__class__.__name__ == 'float64' and math.isnan(res):
            res = Global_Constant

        if res.__class__.__name__ == 'ndarray':
            retDict[getter[4:]] = list(res)
        else:
            retDict[getter[4:]] = res
    # NOTE(review): as shown, the function neither returns `retDict` nor
    # closes `h5` -- confirm whether the tail of the definition is missing.
Beispiel #8
0
def get_attributes(files, getters):
    """Yield ``(getter_name, values)`` with one value per file, per getter.

    For every name in ``getters`` the function of that name is looked up on
    ``hdf5_getters`` and applied to each file in ``files``; each file is
    opened and closed again for every getter. ``files`` is iterated once per
    getter, so it has to be re-iterable (e.g. a list) to yield full results
    for more than one getter.
    """
    for getter in getters:
        getter_func = getattr(hdf5_getters, getter)
        attrib = []
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                attrib.append(getter_func(h5))
            finally:
                # Do not leak the handle when the getter raises.
                h5.close()
        yield getter, attrib
Beispiel #9
0
def get_list_attr(path_list, attr):
    """Return the value of getter ``attr`` for every file in ``path_list``.

    ``attr`` is the full name of a function on ``hdf5_getters``. Files are
    processed in order and a progress line is printed for each one. When
    reading a file fails, a failure line is printed and ``0`` is stored in
    its place, so the result always has one entry per input path.
    """
    paths = list(path_list)
    total = len(paths)  # replaces the hard-coded '/2350' in the progress line
    attr_list = []
    for i, path in enumerate(paths, 1):
        try:
            file_read = hdf5_getters.open_h5_file_read(path)
            try:
                attr_list.append(getattr(hdf5_getters, attr)(file_read))
            finally:
                # Close the handle on failure as well as on success.
                file_read.close()
            print('Finished ' + str(i) + '/' + str(total))
        except Exception:
            # Deliberately best effort, but no longer a bare `except:` that
            # would also swallow KeyboardInterrupt / SystemExit.
            print('---- Failed to get ' + str(path) + ' ---- No:' + str(i))
            attr_list.append(0)
    return attr_list
def get_song_info(song_path, pickle_path):
    """Collect the scalar fields of the first song in ``song_path``.

    A dict is started with ``pickle_id`` set to ``get_song_id(song_path)``.
    Every getter returned by ``get_modified_getters()`` is then called on the
    opened file for song index 0: ndarray results only have their shape
    printed, all other results are stored as ``str(value)`` under the getter
    name without its ``get_`` prefix.

    Exits the process with status 0 when ``song_path`` is not an existing
    file or the file contains no songs.
    """
    # Create a dictionary with fields and dump in pickle
    data = {}
    data['pickle_id'] = get_song_id(song_path)

    # get params
    hdf5path = song_path
    songidx = 0

    # sanity check
    if not os.path.isfile(hdf5path):
        print('ERROR: file {} does not exist.'.format(hdf5path))
        sys.exit(0)
    h5 = hdf5_getters.open_h5_file_read(hdf5path)
    numSongs = hdf5_getters.get_num_songs(h5)
    if songidx >= numSongs:
        print('ERROR: file contains only {}'.format(numSongs))
        h5.close()
        sys.exit(0)

    # get all getters
    getters = get_modified_getters()

    for getter in getters:
        try:
            res = getattr(hdf5_getters, getter)(h5, songidx)
        except AttributeError as e:
            # NOTE(review): `summary` is not defined inside this function; it
            # has to exist at module level -- confirm.
            if not summary:
                print(e)
                print('forgot -summary flag? specified wrong getter?')
            # Never fall through: `res` would be unbound or stale here.
            continue
        if res.__class__.__name__ == 'ndarray':
            print('{}: shape = {}'.format(getter[4:], res.shape))
        else:
            data[getter[4:]] = str(res)
    # NOTE(review): as shown, `pickle_path` is unused, `data` is not written
    # or returned and `h5` is not closed -- confirm whether the tail of the
    # definition is missing.
Beispiel #11
0
def main():
    """Export selected song properties of every HDF5 file to ``data.csv``.

    Expects exactly one command-line argument, the directory with the data
    files; otherwise a usage message is printed and the function returns.
    The first CSV row holds the property names. Files whose song hotttnesss
    is NaN are skipped. ``artist_terms`` is written as a ``;``-joined string,
    every other property as ``str(value)``. Exits the process with status 0
    once the file has been written.
    """
    if len(sys.argv) != 2:
        print('Takes one argument, the directory with the data files.')
        return

    hdf5_files = get_all_files(sys.argv[1])

    # Define properties to get
    properties = [
        'danceability', 'duration', 'end_of_fade_in', 'energy', 'key',
        'loudness', 'mode', 'song_hotttnesss', 'start_of_fade_out', 'tempo',
        'time_signature', 'year', 'artist_terms',
    ]

    count_datapoints = 0
    with open('data.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(properties)

        for abspath in hdf5_files:
            h5 = hdf5_getters.open_h5_file_read(abspath)
            try:
                # Skip songs without a usable hotttnesss value.
                if math.isnan(hdf5_getters.get_song_hotttnesss(h5)):
                    continue
                prop_values = []
                for prop in properties:
                    prop_value = getattr(hdf5_getters, 'get_' + prop)(h5)
                    # special case artist terms to format it
                    if prop == 'artist_terms':
                        prop_values.append(';'.join(str(x) for x in prop_value))
                    else:
                        prop_values.append(str(prop_value))
            finally:
                # Close the handle on every path, including getter failures.
                h5.close()
            csvwriter.writerow(prop_values)
            count_datapoints += 1
    # The original used a bare '%' with str.format, so the count never showed.
    print('Wrote {} datapoints (lines)'.format(count_datapoints))
    sys.exit(0)
Beispiel #12
0
def main(rootdir='/Users/Jerry/desktop/2017_Spring/COSI 132A/project/MillionSongSubset/data'):
    """Load every song file below ``rootdir`` into the ``music`` mapping.

    Each file found by ``os.walk`` is opened with ``hdf5_getters`` and every
    ``get_*`` function of that module is called for song index 0, in sorted
    name order. ndarray results are stored as lists, everything else as
    ``str(value)``, keyed by the getter name without its ``get_`` prefix.
    The resulting dict is stored in ``music`` under a running number that
    starts at 1.

    PARAM
        rootdir - directory to walk; defaults to the path that used to be
                  hard-coded in the body
    """
    # The getter list does not depend on the file, so build it once.
    getters = sorted(
        name for name in vars(hdf5_getters) if name[:4] == 'get_')
    # Only the first song is read: the files were checked to hold one song.
    songidx = 0
    number = 0
    for subdir, dirs, files in os.walk(rootdir):
        for f in files:
            fileroot = os.path.join(subdir, f)
            h5 = hdf5_getters.open_h5_file_read(fileroot)
            number = number + 1
            song = {}
            try:
                for getter in getters:
                    try:
                        res = getattr(hdf5_getters, getter)(h5, songidx)
                    except AttributeError as e:
                        # NOTE(review): `summary` is not defined inside this
                        # function; it has to exist at module level -- confirm.
                        if not summary:
                            print(e)
                            print('forgot -summary flag? specified wrong getter?')
                        # Never fall through: `res` would be unbound or stale.
                        continue
                    if res.__class__.__name__ == 'ndarray':
                        song[getter[4:]] = list(res)
                    else:
                        song[getter[4:]] = str(res)
            finally:
                # Close the handle even when a getter raises.
                h5.close()
            music[number] = song
Beispiel #13
0
    elif onegetter != '':
        if onegetter[:4] != 'get_':
            onegetter = 'get_' + onegetter
        try:
            getters.index(onegetter)
        except ValueError:
            print 'ERROR: getter requested:', onegetter, 'does not exist.'
            h5.close()
            sys.exit(0)
        getters = [onegetter]
    getters = np.sort(getters)

    # print them
    for getter in getters:
        try:
            res = hdf5_getters.__getattribute__(getter)(h5, songidx)
        except AttributeError, e:
            if summary:
                continue
            else:
                print e
                print 'forgot -summary flag? specified wrong getter?'
        if res.__class__.__name__ == 'ndarray':
            #print getter[4:]+": shape =",res.shape
            d[getter[4:]] = res.tolist()
        else:
            #print getter[4:]+":",res
            if res != res:
                res = None
            d[getter[4:]] = res
Beispiel #14
0
		for filed in files:
			h5 = hdf5_getters.open_h5_file_read(root+"/"+filed)

			# get all getters
			keys_to_extract = ["get_song_id","get_title", "get_track_id", "get_artist_id", "get_artist_name", "get_duration", "get_year", "get_artist_location", "get_artist_familiarity", "get_artist_hotttnesss", "get_loudness"]
			getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
			getters.remove("get_num_songs") # special case
			
			getters = list(set(getters).intersection(set(keys_to_extract)))
			getters.sort(key=lambda x: keys_to_extract.index(x))

			# write extracted data to file
			csvstring = []
			for getter in getters:
				try:
					res = hdf5_getters.__getattribute__(getter)(h5,songidx)

					if getter == "get_song_id":
						if res in song_dict:
							song_numeric_id = song_dict[res]
						else:
							ids += 1
							song_numeric_id = song_dict[res] = ids

						csvstring.append(song_numeric_id)

				except AttributeError, e:
					continue
				if res.__class__.__name__ == 'ndarray':
					continue
Beispiel #15
0
    song_order[song[0]] = song[1]

outputDir = output_dir
i = 0
hits = 0

for dirpath, dirnames, filenames in os.walk(input_dir):
    for track_file in filenames:
        #print track_file
        #song = re.split(r'[ ]', songs[i])
        output = "<song xmlns=\'http://labrosa.ee.columbia.edu/millionsong/\'>\n"
        h5 = hdf5_getters.open_h5_file_read(os.path.join(dirpath, track_file))
        song_id = hdf5_getters.get_song_id(h5)
        for getter in getters:
            try:
                res = hdf5_getters.__getattribute__(getter)(h5)
            except AttributeError, e:
                continue
            if res.__class__.__name__ == 'ndarray':
                output = output + "<" + getter[4:] + ">" + str(
                    res.shape) + "</" + getter[4:] + ">\n"
            else:
                output = output + "<" + getter[4:] + ">" + str(
                    res) + "</" + getter[4:] + ">\n"
        h5.close()
        if song_id in song_order:
            output = output + "<order>" + song_order[
                song_id][:-1] + "</order>\n"
            logger.debug(track_file + ' HIT')
            hits = hits + 1
Beispiel #16
0
def transfer(h5path, matpath=None, force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there is more than one song in the HDF5 file, each
    field name gets a number appended: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path
                  with a different extension by default
        force   - if True and matfile exists, overwrite
    RETURN
        True if the file was transferred, False if there was
        a problem (missing file, wrong extension, or the matfile
        already exists and force is False; the last case is silent).
        MemoryError is re-raised after printing a hint.
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print('path to HF5 files does not exist: {}'.format(h5path))
        return False
    if os.path.splitext(h5path)[1] != '.h5':
        print('expecting a .h5 extension for file: {}'.format(h5path))
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if not force:
            return False
        print('overwriting file: {}'.format(matpath))
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form
    # (built as a list so that .remove() works on Python 2 and 3 alike)
    getters = [name for name in vars(hdf5_getters) if name[:4] == 'get_']
    getters.remove("get_num_songs")  # special case
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    matdata = {
        'transfer_note':
        'transferred on ' + time.ctime() + ' from file: ' + h5path
    }
    try:
        # inside the try so the file is closed if counting the songs fails
        nSongs = hdf5_getters.get_num_songs(h5)
        # iterate over songs
        for songidx in range(nSongs):
            # field names are only numbered when the file holds several songs
            suffix = str(songidx + 1) if nSongs > 1 else ''
            # iterate over getter
            for getter in getters:
                matdata[getter[4:] + suffix] = getattr(hdf5_getters, getter)(h5, songidx)
    except MemoryError:
        print('Memory Error with file: {}'.format(h5path))
        print('All data has to be loaded in memory before being saved as matfile')
        print('Is this an aggregated / summary file with tons of songs?')
        print('This code is optimized for files containing one song,')
        print('but write me an email! (TBM)')
        raise
    finally:
        # close h5
        h5.close()
    # create
    sio.savemat(matpath, matdata)
    # all good
    return True
Beispiel #17
0
 
             if numSongs>1:
                 print "Error: More than one song is included in file ", filename
                 f.close()
                 sys.exit(0)
             
             getters = filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys())
             getters.remove("get_num_songs") # special case
             getters = np.sort(getters)
 
             dict_get ={'get_track_id':'','get_track_7digitalid':'','get_title':'','get_artist_id':'','get_artist_7digitalid':'','get_artist_name':'','get_artist_hotttnesss':'','get_artist_latitude':'','get_artist_location':'','get_artist_longitude':'','get_danceability':'','get_duration':'','get_energy':'','get_loudness':'','get_release':'','get_release_7digitalid':'','get_song_hotttnesss':'','get_song_id':'','get_tempo':'','get_time_signature':'','get_time_signature_confidence':'','get_year':''}
             # print them
             for getter in getters:
             	if getter in sel_get:
             		try:
             		    dict_get[getter] = hdf5_getters.__getattribute__(getter)(h5,songidx)
             		except AttributeError, e:
             		    continue
             
             f.write(str(dict_get['get_track_id']) + '\t')
             f.write(str(dict_get['get_track_7digitalid']) + '\t')
             f.write(str(dict_get['get_title']) + '\t')
             f.write(str(dict_get['get_artist_id']) + '\t')
             f.write(str(dict_get['get_artist_7digitalid']) + '\t')
             f.write(str(dict_get['get_artist_name']) + '\t')
             f.write(str(dict_get['get_artist_hotttnesss']) + '\t')
             f.write(str(dict_get['get_artist_latitude']) + '\t' )
             f.write(str(dict_get['get_artist_location']) + '\t' )
             f.write(str(dict_get['get_artist_longitude']) + '\t')
             f.write(str(dict_get['get_danceability']) + '\t' )
             f.write(str(dict_get['get_duration']) + '\t' )
    freq = {} #2-dim dict of dict's represented as freq[year][term]
    uniqueWordLst = [] #uniqueVector of unique words to loop
    yearLst = [] #Vector of all years
    for hdf5path in allh5:
        #params
        songidx = 0

        #sanity check (use when dir scanning works)
        if not os.path.isfile(hdf5path):
        	print 'ERROR: file',hdf5path,'does not exist.'
    		continue

    	#PRINT THE FREQUENCY LIST
        h5 = hdf5_getters.open_h5_file_read(hdf5path)
        artist_terms = hdf5_getters.__getattribute__('get_artist_terms')(h5,0)
        year = hdf5_getters.__getattribute__('get_year')(h5,0)

      	#fill up the freq dict
        if(year != 0):
        	freq[year] = []
        	if(not(year in yearLst)):
        		yearLst.append(year)
        	for words in artist_terms:
        		words =  words.split()
        		for word in words:
        			if(not(word in uniqueWordLst)):
        				uniqueWordLst.append(word)
            		incrWordFreq(freq[year], word)
	        
        h5.close()
def transfer(h5path,matpath=None,force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there is more than one song in the HDF5 file, each
    field name gets a number appended: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path
                  with a different extension by default
        force   - if True and matfile exists, overwrite
    RETURN
        True if the file was transferred, False if there was
        a problem (missing file, wrong extension, or the matfile
        already exists and force is False).
        MemoryError is re-raised after printing a hint.
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print('path to HF5 files does not exist: {}'.format(h5path))
        return False
    if os.path.splitext(h5path)[1] != '.h5':
        print('expecting a .h5 extension for file: {}'.format(h5path))
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if not force:
            print('matfile {} already exists (delete or force):'.format(matpath))
            return False
        print('overwriting file: {}'.format(matpath))
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form
    # (built as a list so that .remove() works on Python 2 and 3 alike)
    getters = [name for name in vars(hdf5_getters) if name[:4] == 'get_']
    getters.remove("get_num_songs") # special case
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    try:
        # inside the try so the file is closed if counting the songs fails
        nSongs = hdf5_getters.get_num_songs(h5)
        # iterate over songs
        for songidx in range(nSongs):
            # field names are only numbered when the file holds several songs
            suffix = str(songidx+1) if nSongs > 1 else ''
            # iterate over getter
            for getter in getters:
                matdata[getter[4:] + suffix] = getattr(hdf5_getters, getter)(h5,songidx)
    except MemoryError:
        print('Memory Error with file: {}'.format(h5path))
        print('All data has to be loaded in memory before being saved as matfile')
        print('Is this an aggregated / summary file with tons of songs?')
        print('This code is optimized for files containing one song,')
        print('but write me an email! (TBM)')
        raise
    finally:
        # close h5
        h5.close()
    # create
    sio.savemat(matpath,matdata)
    # all good
    return True
Beispiel #20
0
    song_order[song[0]] = song[1]

outputDir = output_dir
i = 0
hits = 0

for dirpath, dirnames, filenames in os.walk(input_dir):
    for track_file in filenames:
        #print track_file
        #song = re.split(r'[ ]', songs[i])
        output = "<song xmlns=\'http://labrosa.ee.columbia.edu/millionsong/\'>\n"
        h5 = hdf5_getters.open_h5_file_read(os.path.join(dirpath, track_file))
        song_id = hdf5_getters.get_song_id(h5)
        for getter in getters:
            try:
                res = hdf5_getters.__getattribute__(getter)(h5)
            except AttributeError, e:
                continue 
            if res.__class__.__name__ == 'ndarray':
                output = output + "<"+getter[4:]+">"+str(res.shape)+"</"+getter[4:]+">\n"
            else:
                output = output + "<"+getter[4:]+">"+str(res)+"</"+getter[4:]+">\n"
        h5.close()
        if song_id in song_order:
            output = output + "<order>" + song_order[song_id][:-1] + "</order>\n"
            logger.debug(track_file +' HIT')
            hits = hits + 1     
        
        if song_id in listen_dict:
            logger.debug("user listens: " + track_file)
            for user_listen in listen_dict[song_id]: