# Dependencies used below. NUM_FEATURES, the SOURCE_/TARGET_ data-file constants
# and the local feature-helper module `ut` are assumed to be defined elsewhere
# in the project; these imports may duplicate the full script's header.
import pickle

import numpy as np
from scipy import stats

import hdf5_getters as GETTERS  # Million Song Dataset helper module


def parse_file(file, targetfile):
    """Parse a '<track_id> <label>' file and pickle the extracted features."""
    print "parsing %s" % file
    global tracks, labels, features
    tracks = list()
    labels = list()
    with open(file, 'r') as f:
        for line in f:
            track, label = line.strip().split(' ')
            tracks.append(track)
            labels.append(int(label))
    labels = np.array(labels)
    # hard-code the number of features, otherwise scaling becomes a problem
    # (5*12 timbre + 5*12 pitches + tempo + 3 loudness values + energy)
    features = np.empty((len(tracks), NUM_FEATURES), dtype='float')
    track_info = np.empty((len(tracks)), dtype='object')
    for i, track in enumerate(tracks):
        #print track + ' - ' + ut.get_track_info(track)
        if i % 100 == 0:
            print "processing track: %s\t%s" % (str(i), str(track))
        # fetch the h5 file from the local subset to allow faster preprocessing
        if file == SOURCE_DATA_FILE:
            h5 = GETTERS.open_h5_file_read("../../msd_dense_subset/dense/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5")
        else:
            h5 = GETTERS.open_h5_file_read("../../msd_dense_subset/mood/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5")
        track_info[i] = ut.get_track_info(track, h5)
        timbre = ut.get_timbre(track, h5)        # tuple with 5 elements of 12 values each (12*5 = 60)
        tempo = ut.get_tempo_feature(track, h5)  # (1)
        loudness = ut.get_loudness(track, h5)    # tuple with 3 elements (3)
        energy = ut.get_energy_feature(track)    # (1)
        pitches = ut.get_pitches(track, h5)      # tuple with 5 elements of 12 values each (12*5 = 60)
        features[i] = np.concatenate((timbre[0], timbre[1], timbre[2], timbre[3], timbre[4],
                                      pitches[0], pitches[1], pitches[2], pitches[3], pitches[4],
                                      np.array([tempo]), np.array(loudness), np.array([energy])))
        h5.close()
    print "done parsing"
    print "saving data"
    data = {
        'features': features,
        'labels': labels,
        'tracks': tracks,  # (songindex, songid)
        'track_titles': track_info
    }
    with open(targetfile, 'wb') as f:  # binary mode: HIGHEST_PROTOCOL is a binary pickle format
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    print "data saved to %s" % targetfile

#SCRIPT
#parse_file(SOURCE_DATA_FILE_2, E_TARGET_DATA_FILE_2)
#parse_file(SOURCE_DATA_FILE, E_TARGET_DATA_FILE)
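# A minimal sketch of reading a pickled feature file back in. This helper is
# illustrative only and not part of the original pipeline; it just mirrors the
# dictionary written by parse_file above.
def load_parsed_file(targetfile):
    """Load the features/labels/tracks/track_titles pickled by parse_file."""
    with open(targetfile, 'rb') as f:  # binary mode to match pickle.HIGHEST_PROTOCOL
        data = pickle.load(f)
    return data['features'], data['labels'], data['tracks'], data['track_titles']

#example:
#features, labels, tracks, track_titles = load_parsed_file(E_TARGET_DATA_FILE)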
timbre_min = np.zeros((len(tracks), 12))
timbre_max = np.zeros((len(tracks), 12))
pitches_means = np.zeros((len(tracks), 12))
pitches_vars = np.zeros((len(tracks), 12))
pitches_median = np.zeros((len(tracks), 12))
pitches_min = np.zeros((len(tracks), 12))
pitches_max = np.zeros((len(tracks), 12))

for idx, track in enumerate(tracks):
    # fetch the h5 file from the mood subset to allow faster preprocessing
    h5 = GETTERS.open_h5_file_read("../../msd_dense_subset/mood/" + track[2] + "/" + track[3] + "/" + track[4] + "/" + track + ".h5")
    keys[idx], modes[idx] = ut.get_key_feature(track, h5)
    loudnesses[idx], loudnesses_var[idx], loudnesses_interval[idx] = ut.get_loudness(track, h5)
    tempos[idx] = ut.get_tempo_feature(track, h5)
    time_signatures[idx] = ut.get_time_signature(track, h5)
    timbre_means[idx], timbre_vars[idx], timbre_median[idx], timbre_min[idx], timbre_max[idx] = ut.get_timbre(track, h5)
    pitches_means[idx], pitches_vars[idx], pitches_median[idx], pitches_min[idx], pitches_max[idx] = ut.get_pitches(track, h5)
    energies[idx] = ut.get_energy_feature(track)
    h5.close()

# use binning for the continuous features
# problem: choosing the number of bins => Freedman-Diaconis rule
# (bin width = 2*IQR*n^(-1/3); note the float exponent, 1/3 evaluates to 0 in Python 2)
bin_width = 2 * (stats.scoreatpercentile(loudnesses_interval, 75) - stats.scoreatpercentile(loudnesses_interval, 25)) * len(loudnesses_interval) ** (-1.0 / 3)
num_bins = int(np.ceil((max(loudnesses_interval) - min(loudnesses_interval)) / bin_width))
bins = np.linspace(min(loudnesses_interval), max(loudnesses_interval), num=num_bins)
d_loudnesses_interval = np.digitize(loudnesses_interval, bins)

bin_width = 2 * (stats.scoreatpercentile(loudnesses, 75) - stats.scoreatpercentile(loudnesses, 25)) * len(loudnesses) ** (-1.0 / 3)
num_bins = int(np.ceil((max(loudnesses) - min(loudnesses)) / bin_width))
bins = np.linspace(min(loudnesses), max(loudnesses), num=num_bins)
d_loudnesses = np.digitize(loudnesses, bins)

bin_width = 2 * (stats.scoreatpercentile(tempos, 75) - stats.scoreatpercentile(tempos, 25)) * len(tempos) ** (-1.0 / 3)
num_bins = int(np.ceil((max(tempos) - min(tempos)) / bin_width))
bins = np.linspace(min(tempos), max(tempos), num=num_bins)
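# The binning blocks above repeat the same Freedman-Diaconis computation. A
# minimal sketch of the rule factored into a helper; the name fd_digitize is
# hypothetical and not part of the original script, and it assumes a 1-D
# continuous feature with a non-zero interquartile range.
def fd_digitize(values):
    """Digitize a 1-D continuous feature using Freedman-Diaconis bins."""
    values = np.asarray(values, dtype='float')
    iqr = stats.scoreatpercentile(values, 75) - stats.scoreatpercentile(values, 25)
    bin_width = 2 * iqr * len(values) ** (-1.0 / 3)  # FD bin width: 2*IQR*n^(-1/3)
    num_bins = int(np.ceil((values.max() - values.min()) / bin_width))
    bins = np.linspace(values.min(), values.max(), num=num_bins)
    return np.digitize(values, bins)

#example: d_loudnesses = fd_digitize(loudnesses)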