def get_all_titles(basedir,targetdir,ext='.h5') :
    
    global cap
    
    decadecount = defaultdict(int)
    i = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            year = hdf5_getters.get_year(h5)
            print i
            i+=1
            if year == 0:
                h5.close()
                continue
            label = getbin(year)
            if decadecount[label] > cap:
                flag = checkforcompletion(decadecount)
                h5.close()
                if flag: # All the bins have exceeded their count. We can proceed with the training
                    return decadecount
                continue
            
            # Copy files
            copy(f,targetdir)
            
            decadecount[label] += 1
            h5.close()
    
    return decadecount
Example #2
0
def extract_features(filename):
    h5 = hdf5_getters.open_h5_file_read(filename)
    f = [None] * len(features)
    f[features.index('track_id')] = hdf5_getters.get_track_id(h5, 0).item()
    f[features.index('song_id')] = hdf5_getters.get_song_id(h5, 0).item()
    f[features.index('hotttnesss')] = hdf5_getters.get_artist_hotttnesss(
        h5, 0).item()
    f[features.index('danceability')] = hdf5_getters.get_danceability(
        h5, 0).item()
    f[features.index('duration')] = hdf5_getters.get_duration(h5, 0).item()
    f[features.index('key')] = hdf5_getters.get_key(h5, 0).item()
    f[features.index('energy')] = hdf5_getters.get_energy(h5, 0).item()
    f[features.index('loudness')] = hdf5_getters.get_loudness(h5, 0).item()
    f[features.index('year')] = hdf5_getters.get_year(h5, 0).item()
    f[features.index('time_signature')] = hdf5_getters.get_time_signature(
        h5, 0).item()
    f[features.index('tempo')] = hdf5_getters.get_tempo(h5, 0).item()
    tags = ''
    for tag in hdf5_getters.get_artist_terms(h5):
        tags += ('%s|' % tag)
    # Remove trailing pipe.
    tags = tags[:len(tags) - 1]
    f[features.index('tags')] = tags
    h5.close()
    return f
Example #3
0
def feat_from_file(path):
    
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    
    feats.append( GETTERS.get_track_id(h5) )
    feats.append( GETTERS.get_title(h5) )
    feats.append( GETTERS.get_artist_name(h5) )
    feats.append( GETTERS.get_year(h5) )
    feats.append( GETTERS.get_loudness(h5) )
    feats.append( GETTERS.get_tempo(h5) )
    feats.append( GETTERS.get_time_signature(h5) )
    feats.append( GETTERS.get_key(h5) )
    feats.append( GETTERS.get_mode(h5) )
    feats.append( GETTERS.get_duration(h5) )
    
    #timbre
    timbre = GETTERS.get_segments_timbre(h5)
    avg_timbre = np.average(timbre, axis=0)
    for k in avg_timbre:
        feats.append(k)
    var_timbre = np.var(timbre, axis=0)
    for k in var_timbre:
        feats.append(k)

    h5.close()
    
    return feats
Example #4
0
def func_to_extract_features(filename):
    """
    This function extracts all features: per-track, per-section and per-segment
    """
#    - open the song file
    h5 = GETTERS.open_h5_file_read(filename)
#    - get per-track features and put them

    artist_id = GETTERS.get_artist_id(h5)
    song_id   = GETTERS.get_song_id(h5)

    artist_familiarity          = GETTERS.get_artist_familiarity(h5)
    artist_hotttnesss           = GETTERS.get_artist_hotttnesss(h5)
    artist_latitude             = GETTERS.get_artist_latitude(h5)
    artist_longitude            = GETTERS.get_artist_longitude(h5)
    danceability                = GETTERS.get_danceability(h5)
    energy                      = GETTERS.get_energy(h5)
    loudness                    = GETTERS.get_loudness(h5)
    song_hotttnesss             = GETTERS.get_song_hotttnesss(h5)
    tempo                       = GETTERS.get_tempo(h5)
    year                        = GETTERS.get_year(h5)

#   artist_ids.add(artist_id)

#    features_tuple = (artist_id, artist_familiarity, artist_hotttnesss, artist_latitude, artist_longitude, danceability, energy, loudness, song_hotttnesss, tempo, year)
    features_tuple = (artist_id, artist_familiarity, artist_hotttnesss, loudness, song_hotttnesss, tempo, year)
 #   print features_tuple
    
    features_tuples[song_id] = features_tuple
    
#    files_per_artist[artist_id] += 1
#    - close the file
    h5.close()
def get_all_examples(basedir, genre_dict, ext='.h5'):
    """
    From a base directory, goes through all subdirectories,
    and grabs all songs and their features and puts them into a pandas dataframe 
    INPUT
       basedir    - base directory of the dataset
       genre_dict - a dictionary mapping track id to genre based tagraum dataset
       ext        - extension, .h5 by default
    RETURN
       dataframe containing all song examples
    """
    features_vs_genre = pd.DataFrame()

    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        # # count files
        # count += len(files)
        # apply function to all files
        for f in files:
            h5 = GETTERS.open_h5_file_read(f)
            song_id = GETTERS.get_track_id(h5).decode('utf-8')
            if (song_id in genre_dict):
                genre = genre_dict[song_id]
                year = GETTERS.get_year(h5)
                duration = GETTERS.get_duration(h5)
                end_of_fade_in = GETTERS.get_end_of_fade_in(h5)
                loudness = GETTERS.get_loudness(h5)
                song_hotttnesss = GETTERS.get_song_hotttnesss(h5)
                tempo = GETTERS.get_tempo(h5)
                key = GETTERS.get_key(h5)
                key_confidence = GETTERS.get_key_confidence(h5)
                mode = GETTERS.get_mode(h5)
                mode_confidence = GETTERS.get_mode_confidence(h5)
                time_signature = GETTERS.get_time_signature(h5)
                time_signature_confidence = GETTERS.get_time_signature_confidence(
                    h5)
                artist_name = GETTERS.get_artist_name(h5)
                title = GETTERS.get_title(h5)
                # length of sections_start array gives us number of start
                num_sections = len(GETTERS.get_sections_start(h5))
                num_segments = len(GETTERS.get_segments_confidence(h5))
                example = pd.DataFrame(
                    data=[
                        (artist_name, title, song_id, genre, year, key,
                         key_confidence, mode, mode_confidence, time_signature,
                         time_signature_confidence, duration, end_of_fade_in,
                         loudness, song_hotttnesss, tempo, num_sections)
                    ],
                    columns=[
                        'artist_name', 'title', 'song_id', 'genre', 'year',
                        'key', 'key_confidence', 'mode', 'mode_confidence',
                        'time_signature', 'time_signature_confidence',
                        'duration', 'end_of_fade_in', 'loudness',
                        'song_hotttnesss', 'tempo', 'num_segments'
                    ])
                features_vs_genre = features_vs_genre.append(example)
            h5.close()

    return features_vs_genre
Example #6
0
def func_to_desired_song_data(filename):
    h5 = GETTERS.open_h5_file_read(filename)
    track_id = GETTERS.get_track_id(h5)
    for song in random_songs:
        if song[0] == track_id:
            print("FOUND ONE!")
            title = replace_characters(GETTERS.get_title(h5))
            artist = replace_characters(GETTERS.get_artist_name(h5))
            year = GETTERS.get_year(h5)
            tempo = GETTERS.get_tempo(h5)
            key = GETTERS.get_key(h5)
            loudness = GETTERS.get_loudness(h5)
            energy = GETTERS.get_energy(h5)
            danceability = GETTERS.get_danceability(h5)
            time_signature = GETTERS.get_time_signature(h5)
            mode = GETTERS.get_mode(h5)
            hotttness = GETTERS.get_song_hotttnesss(h5)

            song_data = {
                'title': title,
                'artist': artist,
                'year': year,
                'tempo': tempo,
                'key': key,
                'loudness': loudness,
                'energy': energy,
                'danceability': danceability,
                'time_signature': time_signature,
                'mode': mode,
                'hotttness': hotttness
            }

            all_the_data.append(song_data)

    h5.close()
def load_raw_data():
    years = []
    timbres = []
    pitches = []
    min_length = 10000
    num = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                if year != 0:
                    timbre = getter.get_segments_timbre(h5)
                    s = np.size(timbre, 0)
                    if s >= 100:
                        if s < min_length:
                            min_length = s
                        pitch = getter.get_segments_pitches(h5)
                        years.append(year)
                        timbres.append(timbre)
                        pitches.append(pitch)
            except:
                print(1)
            h5.close()
    return years, timbres, pitches, min_length
Example #8
0
def parse_songs(directory):
    global count
    global MAX_SONGS
    for filename in os.listdir(directory):
        if count >= MAX_SONGS: return
        file_path = os.path.join(directory, filename)
        if os.path.isdir(file_path):
            parse_songs(file_path)
        else:
            count += 1
            if count % 100 == 0: print('Parsed ' + str(count) + ' songs')

            with hdf5_getters.open_h5_file_read(file_path) as h5:
                for i in range(hdf5_getters.get_num_songs(h5)):
                    title = hdf5_getters.get_title(h5, i).decode('UTF-8')
                    year = hdf5_getters.get_year(h5, i).item()
                    danceability = hdf5_getters.get_danceability(h5, i).item()
                    tags = hdf5_getters.get_artist_mbtags(h5, i).tolist()
                    genres = [tag.decode('UTF-8') for tag in tags]
                    tempo = hdf5_getters.get_tempo(h5, i).item()

                    song = {
                        'title': title,
                        'year': year,
                        'danceability': danceability,
                        'genres': genres,
                        'tempo': tempo
                    }
                    song = os.path.splitext(filename)
                    with open(
                            "/home/ubuntu/million_songs/parsed_data/" +
                            song[0] + '.json', 'w') as fp:
                        json.dump(song, fp)
def load_non_time_data():
    years = []
    ten_features=[]
    num = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                if year!=0:
                    years.append(year)
                    title_length = len(getter.get_title(h5))
                    terms_length = len(getter.get_artist_terms(h5))
                    tags_length = len(getter.get_artist_mbtags(h5))
                    hotness = getter.get_artist_hotttnesss(h5)
                    duration = getter.get_duration(h5)
                    loudness = getter.get_loudness(h5)
                    mode = getter.get_mode(h5)
                    release_length = len(getter.get_release(h5))
                    tempo = getter.get_tempo(h5)
                    name_length = len(getter.get_artist_name(h5))
                    ten_feature = np.hstack([title_length,tags_length, hotness, duration,
                                             terms_length, loudness, mode, release_length, tempo, name_length])
                    ten_features.append(ten_feature) 
            except:
                print(1)
            h5.close()
    return years,ten_features
def create_songdet(h5, sngidxfle):
    '''
    Collects song details for all unique songs heard by 100 raters.
    Format of dictionary: { SongID : [ Att_0, Att_1, Att_2 ] }
    '''
    import hdf5_getters
    sngdetfle = open("songdet.txt", "wb")
    sngdetdic = dict()
    sngidxfle = open(sngidxfle, "rb")
    sngidxdic = pickle.load(sngidxfle)
    for elem in sngidxdic:
        songidx = sngidxdic[elem]
        tempo = hdf5_getters.get_tempo(h5, songidx)
        loud = hdf5_getters.get_loudness(h5, songidx)
        year = hdf5_getters.get_year(h5, songidx)
        tmsig = hdf5_getters.get_time_signature(h5, songidx)
        key = hdf5_getters.get_key(h5, songidx)
        mode = hdf5_getters.get_mode(h5, songidx)
        duration = hdf5_getters.get_duration(h5, songidx)
        fadein = hdf5_getters.get_end_of_fade_in(h5, songidx)
        fadeout = hdf5_getters.get_start_of_fade_out(h5, songidx)
        artfam = hdf5_getters.get_artist_familiarity(h5, songidx)
        sngdetdic[elem] = [duration, tmsig, tempo,
                           key, mode, fadein, fadeout, year, loud, artfam]
    pickle.dump(sngdetdic, sngdetfle)
    sngdetfle.close()
Example #11
0
def getSamples(basedir):
    X, Y = [], []
    feature_labels = [
        'segments_pitch', 'segments_timbre', 'segments_loudness_max', 'tempo'
    ]
    cnt = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*.h5'))
        print(root, cnt)
        # apply function to all files
        for f in files:
            #             try:
            h5 = GETTERS.open_h5_file_read(f)
            year = GETTERS.get_year(h5)
            if year == 0:
                continue
            bttimbre = get_bttimbre(h5)
            if bttimbre is None or year == 0:
                continue
            h5.close()
            X.append(bttimbre)
            Y.append(year)
            cnt += 1

            if cnt > 10000:
                break
    return X, Y, feature_labels
Example #12
0
    def process_song(self, song_path):

        song_data = h5.open_h5_file_read(song_path)

        song_id = h5.get_song_id(song_data).decode('UTF-8')
        song_int_id = int(h5.get_track_7digitalid(song_data))
        song_name = h5.get_title(song_data).decode('UTF-8').lower()
        artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
        song_year = int(h5.get_year(song_data))

        timbre = self.ndarray_list_to_ndlist(h5.get_segments_timbre(song_data))
        chroma = self.ndarray_list_to_ndlist(
            h5.get_segments_pitches(song_data))

        song_data.close()
        song_dict = {
            'id': song_int_id,
            'source_id': song_id,
            'name': song_name,
            'artist': artist_name,
            'year': song_year,
            'timbre': timbre,
            'chroma': chroma
        }
        return song_dict
def create_songdet(h5, sngidxfle):
    '''
    Collects song details for all unique songs heard by 100 raters.
    Format of dictionary: { SongID : [ Att_0, Att_1, Att_2 ] }
    '''
    import hdf5_getters
    sngdetfle = open("songdet.txt", "wb")
    sngdetdic = dict()
    sngidxfle = open(sngidxfle, "rb")
    sngidxdic = pickle.load(sngidxfle)
    for elem in sngidxdic:
        songidx = sngidxdic[elem]
        tempo = hdf5_getters.get_tempo(h5, songidx)
        loud = hdf5_getters.get_loudness(h5, songidx)
        year = hdf5_getters.get_year(h5, songidx)
        tmsig = hdf5_getters.get_time_signature(h5, songidx)
        key = hdf5_getters.get_key(h5, songidx)
        mode = hdf5_getters.get_mode(h5, songidx)
        duration = hdf5_getters.get_duration(h5, songidx)
        fadein = hdf5_getters.get_end_of_fade_in(h5, songidx)
        fadeout = hdf5_getters.get_start_of_fade_out(h5, songidx)
        artfam = hdf5_getters.get_artist_familiarity(h5, songidx)
        sngdetdic[elem] = [
            duration, tmsig, tempo, key, mode, fadein, fadeout, year, loud,
            artfam
        ]
    pickle.dump(sngdetdic, sngdetfle)
    sngdetfle.close()
Example #14
0
    def process_song(self, song_path):
        # read file
        song_data = h5.open_h5_file_read(song_path)

	# process file
        #song_id = h5.get_song_id(song_data).decode('UTF-8')
        song_int_id = int(h5.get_track_7digitalid(song_data))
        song_name = h5.get_title(song_data).decode('UTF-8').lower()
        artist_name = h5.get_artist_name(song_data).decode('UTF-8').lower()
        song_year = int(h5.get_year(song_data))

        sp = SpotifyInterface()
        track_info = sp.search_track_info(artist_name, song_name)

        if track_info == None:
            song_data.close()
            return None

        timbre = self.ndarray_list_to_ndlist(h5.get_segments_timbre(song_data))
        chroma = self.ndarray_list_to_ndlist(h5.get_segments_pitches(song_data))

        song_data.close()

        song_dict = {'id': song_int_id, 'name': song_name, 
                    'artist': artist_name, 'year': song_year, 'timbre': timbre, 
                    'chroma': chroma, **track_info}

        return song_dict
def buildfeatures(basedir,cluster,ext='.h5'):
    
    global cap
    i = 0
    features = []
    decade = []
    decadecount = defaultdict(int)
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            year = hdf5_getters.get_year(h5)
            if year == 0:
                h5.close()
                continue
            bins = getbin(year)
#            dec = (year/10)*10
#            if dec < 1960:
#                h5.close()
#                continue
            if decadecount[bins] > cap:
                flag = checkforcompletion(decadecount)
                h5.close()
                if flag:
                    return features,decade
                continue
            i += 1
            print i 
            clustercount = {}
          
            for x in range(50):
                clustercount[x] = 0
               
            feature = []
           
            try:
                bttimbre = bt.get_bttimbre(h5)
                btT = bttimbre.T
                for x in btT:
                    label = kmeans.predict(x)
                    clustercount[label[0]] += 1
                for cl in clustercount.keys():
                    feature.append(clustercount[cl])

                features.append(feature)
                
                decade.append(bins)
                decadecount[bins] += 1
            
            except:
                h5.close()
                continue
            h5.close()
            
  
    return features,decade
def main():
    # print("we in")
    outputFile1 = open('../Datasets/MSDSubsetCSV.csv', 'w')
    csvRowString = ""

    csvRowString = "Title,ArtistName"
    csvAttributeList = re.split(',', csvRowString)
    for i, v in enumerate(csvAttributeList):
        csvAttributeList[i] = csvAttributeList[i].lower()
    csvRowString += ",\n"

    basedir = '/Users/Owner/Desktop/School/2019-2020/COMP400/MillionSongSubset/'
    ext = ".h5"

    #FOR LOOP
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print(f)
            songH5File = hdf5_getters.open_h5_file_read(f)
            song = Song(str(hdf5_getters.get_song_id(songH5File)))

            song.title = str(hdf5_getters.get_title(songH5File)).replace(
                "b'", "").lower()
            song.artistName = str(
                hdf5_getters.get_artist_name(songH5File)).replace("b'",
                                                                  "").lower()
            song.year = str(hdf5_getters.get_year(songH5File))
            if (int(song.year) < 1990):
                print('nope', int(song.year))
                continue

            for attribute in csvAttributeList:
                # print "Here is the attribute: " + attribute + " \n"

                if attribute == 'ArtistName'.lower():
                    csvRowString += "\"" + song.artistName.replace(
                        "'", "") + "\""  #took out   "\"" before and after
                elif attribute == 'Title'.lower():
                    csvRowString += "\"" + song.title.replace("'", "") + "\""
                else:
                    csvRowString += "Erm. This didn't work. Error. :( :(\n"

                csvRowString += ","

            #Remove the final comma from each row in the csv
            lastIndex = len(csvRowString)
            csvRowString = csvRowString[0:lastIndex - 1]
            csvRowString += "\n"
            outputFile1.write(csvRowString)
            csvRowString = ""

            songH5File.close()

    outputFile1.close()
def getclusters(basedir,ext='.h5') :
    print 'inside clusters'
    features = []
    cfeatures = [] 
    decadecount = defaultdict(int)
    deccount = defaultdict(int)
    i=0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = hdf5_getters.open_h5_file_read(f)
            year = hdf5_getters.get_year(h5)
            if year == 0:
                h5.close()
                continue
            bins = getbin(year)
#            decade.append(bin)
#            dec = (year/10)*10
#            decadecount[dec] += 1
#            if dec < 1960:
#                h5.close()
#                continue
             
            deccount[bin] += 1
            if decadecount[bins] > cap:
                flag = checkforcompletion(decadecount)
                h5.close()
                if flag:
                    for dec in decadecount.keys():
                        print 'Dec : ' + str(dec) + ' Count : ' + str(decadecount[dec])
                    return features,cfeatures
                continue
            i += 1
            print i
            try:
                bttimbre = bt.get_bttimbre(h5)
                btT = bttimbre.T
                for x in btT:
                    features.append(x)
                decadecount[bins] += 1
                btchroma = bt.get_btchromas(h5)
                btc = btchroma.T
                for x in btc:
                    cfeatures.append(x)
                decadecount[bins] += 1
            except:
                h5.close()
                continue
            h5.close()
    for dec in deccount.keys():
        print 'Dec : ' + str(dec) + ' Count : ' + str(decadecount[dec])
    features = array(features)
    cfeatures = array(cfeatures)
    return features,cfeatures
Example #18
0
def songinfo(if_str):
	songs_tracks = pickle.load(open ("../../msd_dense_subset/dense/songs_tracks.pkl",'r'));
	track = str(songs_tracks[if_str])
	# build path
	path = "../../msd_dense_subset/dense/"+track[2]+"/"+track[3]+"/"+track[4]+"/"+track+".h5"
	h5 = GETTERS.open_h5_file_read(path)
	artist_name = GETTERS.get_artist_name(h5)
	song_name = GETTERS.get_title(h5)
	year = GETTERS.get_year(h5, 0)
	#segments = GETTERS.get_segments_start(h5, 0);
	#segments_pitches = GETTERS.get_segments_pitches(h5, 0)
	h5.close()
	return artist_name+ " - " +song_name + " (" +str(year) +")"
Example #19
0
def _extractSongData(file_path, filename):
    # song_id, title, release, artist_name, year
    h5 = hdf5_getters.open_h5_file_read(file_path)
    track_id = filename[:-3]
    song_id = hdf5_getters.get_song_id(h5).decode('UTF-8')
    dig7_id = hdf5_getters.get_track_7digitalid(h5)
    title = hdf5_getters.get_title(h5).decode('UTF-8')
    release = hdf5_getters.get_release(h5).decode('UTF-8')
    artist_name = hdf5_getters.get_artist_name(h5).decode('UTF-8')
    year = hdf5_getters.get_year(h5)
    h5.close()
    # print(song_id, track_id, dig7_id, title, release, artist_name, year)
    return track_id, song_id, dig7_id, title, release, artist_name, year
def extract_features(songlist,outputf):
    """
    Extract features from a list of songs, save them in a give filename
    in MLcomp ready format
    INPUT
        songlist   - arrays of path to HDF5 song files
         outputf   - filename (text file)
    """
    # sanity check
    if os.path.isfile(outputf):
        print 'ERROR:',outputf,'already exists.'
        sys.exit(0)
    # open file
    output = open(outputf,'w')
    # iterate ofer songs
    cnt = 0
    for f in songlist:
        # counter
        cnt += 1
        if cnt % 50000 == 0:
            print 'DOING FILE',cnt,'/',len(songlist)
        # extract info
        h5 = GETTERS.open_h5_file_read(f)
        timbres = GETTERS.get_segments_timbre(h5).T
        year = GETTERS.get_year(h5)
        h5.close()
        # sanity checks
        if year <= 0:
            continue
        if timbres.shape[1] == 0 or timbres.shape[0] == 0:
            continue
        if timbres.shape[1] < 10:
            continue # we can save some space from bad examples?
        # features
        avg = np.average(timbres,1)
        cov = np.cov(timbres)
        covflat = []
        for k in range(12):
            covflat.extend( np.diag(cov,k) )
        covflat = np.array(covflat)
        feats = np.concatenate([avg,covflat])
        # sanity check NaN and INF
        if np.isnan(feats).any() or np.isinf(feats).any():
            continue
        # all good? write to file
        output.write(str(convert_year(year))+' |avgcov')
        for k in range(90):
            output.write(' '+str(k+1)+':%.4f' % feats[k])
        output.write('\n')
    # close file
    output.close()
def extract_features(songlist, outputf):
    """
    Extract features from a list of songs, save them in a give filename
    in MLcomp ready format
    INPUT
        songlist   - arrays of path to HDF5 song files
         outputf   - filename (text file)
    """
    # sanity check
    if os.path.isfile(outputf):
        print 'ERROR:', outputf, 'already exists.'
        sys.exit(0)
    # open file
    output = open(outputf, 'w')
    # iterate ofer songs
    cnt = 0
    for f in songlist:
        # counter
        cnt += 1
        if cnt % 50000 == 0:
            print 'DOING FILE', cnt, '/', len(songlist)
        # extract info
        h5 = GETTERS.open_h5_file_read(f)
        timbres = GETTERS.get_segments_timbre(h5).T
        year = GETTERS.get_year(h5)
        h5.close()
        # sanity checks
        if year <= 0:
            continue
        if timbres.shape[1] == 0 or timbres.shape[0] == 0:
            continue
        if timbres.shape[1] < 10:
            continue  # we can save some space from bad examples?
        # features
        avg = np.average(timbres, 1)
        cov = np.cov(timbres)
        covflat = []
        for k in range(12):
            covflat.extend(np.diag(cov, k))
        covflat = np.array(covflat)
        feats = np.concatenate([avg, covflat])
        # sanity check NaN and INF
        if np.isnan(feats).any() or np.isinf(feats).any():
            continue
        # all good? write to file
        output.write(str(convert_year(year)) + ' |avgcov')
        for k in range(90):
            output.write(' ' + str(k + 1) + ':%.4f' % feats[k])
        output.write('\n')
    # close file
    output.close()
def insert_song():
    print('Inserting song tuples')
    conn = get_conn()
    cursor = get_cursor(conn)

    __id = None
    __title = None
    __avg_rate = None
    __release_date = None
    __duration = None
    __price = None
    __provider_name = None
    __genre_id = None
    __singer_id = None
    __download = None

    try:
        for i in range(hard.NUM_SONGS):
            __id = bytes2str(GETTERS.get_song_id(h5, i))
            __title = bytes2str(GETTERS.get_title(h5, i))
            __avg_rate = 0.0
            # use int() to transform the numpy.int32 to int which is supported by Oracle
            __release_date = int(GETTERS.get_year(h5, i))
            if __release_date == 0:
                __release_date = None
            __duration = int(GETTERS.get_duration(h5, i))
            __price = InfoGenerator.gen_price()
            __provider_name = InfoGenerator.get_provider_name()
            __genre_id = InfoGenerator.get_genre_id()
            __singer_id = bytes2str(GETTERS.get_artist_id(h5, i))
            __download = 0

            cursor.execute(sql.INSERT_SONG,
                    id = __id, title = __title, avg_rate = __avg_rate, release_date = __release_date,
                    duration = __duration, price = __price, provider_name = __provider_name,
                    genre_id = __genre_id, singer_id = __singer_id, download = __download)
            songs[i] = Song(__id, __title, __avg_rate, __release_date, __duration, __price,
                               __provider_name, __genre_id, __singer_id, __download)
        return 0
    except Exception as e:
        print(e)
        print('i:', i, '\nid:',__id, '\ntitle:', __title, '\navg_rate:', __avg_rate,
              '\nrelease_date:', __release_date, '\nduration', __duration, '\nprice', __price,
              'provider_name:',__provider_name, '\ngenre_id:', __genre_id, '\nsinger_id', __singer_id,
              'download:',__download)
        return -1
    finally:
        conn.commit()
        close_all(conn, cursor)
Example #23
0
def songinfo(if_str):
    songs_tracks = pickle.load(
        open("../../msd_dense_subset/dense/songs_tracks.pkl", 'r'))
    track = str(songs_tracks[if_str])
    # build path
    path = "../../msd_dense_subset/dense/" + track[2] + "/" + track[
        3] + "/" + track[4] + "/" + track + ".h5"
    h5 = GETTERS.open_h5_file_read(path)
    artist_name = GETTERS.get_artist_name(h5)
    song_name = GETTERS.get_title(h5)
    year = GETTERS.get_year(h5, 0)
    #segments = GETTERS.get_segments_start(h5, 0);
    #segments_pitches = GETTERS.get_segments_pitches(h5, 0)
    h5.close()
    return artist_name + " - " + song_name + " (" + str(year) + ")"
Example #24
0
def process_song(h5_song_file):
	song = {}
	song['artist_familiarity'] = hdf5_getters.get_artist_familiarity(h5)
	song['artist_id'] = hdf5_getters.get_artist_id(h5)
	song['artist_name'] = hdf5_getters.get_artist_name(h5)
	song['artist_hotttnesss'] = hdf5_getters.get_artist_hotttnesss(h5);
	song['title'] = hdf5_getters.get_title(h5)
	terms = hdf5_getters.get_artist_terms(h5)
	terms_freq = hdf5_getters.get_artist_terms_freq(h5)
	terms_weight = hdf5_getters.get_artist_terms_weight(h5)
	terms_array = []
	# Creating a array of [term, its frequency, its weight]. Doing this for all terms associated
	# with the artist
	for i in range(len(terms)):
		terms_array.append([terms[i], terms_freq[i], terms_weight[i]])	
		
	song['artist_terms'] = terms_array
	beats_start = hdf5_getters.get_beats_start(h5)
	song['beats_start_variance'] = variance(beats_start)   #beats variance in yocto seconds(10^-24s)
	song['number_of_beats'] = len(beats_start)
	song['duration'] = hdf5_getters.get_duration(h5)
	song['loudness'] = hdf5_getters.get_loudness(h5)
	sections_start = hdf5_getters.get_sections_start(h5)
	song['sections_start_variance'] = variance(sections_start)
	song['number_of_sections'] = len(sections_start)
	
	segments_pitches = hdf5_getters.get_segments_pitches(h5)
	(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_pitches)
	song['segments_pitches_variance'] = [variance(a0), variance(a1), variance(a2),
					variance(a3), variance(a4), variance(a5), variance(a6), variance(a7),
					variance(a8), variance(a9), variance(a10), variance(a11)]
	song['segments_pitches_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), 
					mean(a5), mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)]
	
	segments_timbre = hdf5_getters.get_segments_timbre(h5)
	(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) = split_segments(segments_timbre)
	song['segments_timbre_variance'] = [variance(a0), variance(a1), variance(a2),
					variance(a3), variance(a4), variance(a5), variance(a6), variance(a7),
					variance(a8), variance(a9), variance(a10), variance(a11)]
	song['segments_timbre_mean'] = [mean(a0), mean(a1), mean(a2), mean(a3), mean(a4), 
					mean(a5), mean(a6), mean(a7), mean(a8), mean(a9), mean(a10), mean(a11)]
	song['tempo'] = hdf5_getters.get_tempo(h5)
	song['_id'] = hdf5_getters.get_song_id(h5)
	song['year'] = hdf5_getters.get_year(h5)	
	return song
Example #25
0
def validate_song(h5_file,song):
	'''Returns true/false if song is valid or not. This is essentially cleanup and only lets through songs
	which have 'good' data (have a non-negligible duration, and have segments being most important)'''
	try:
		assert gt.get_year(h5_file, song)!='0'
		assert gt.get_duration(h5_file, song)>60.0
		assert gt.get_mode_confidence(h5_file, song)>0.2
		assert gt.get_key_confidence(h5_file, song)>0.2
		assert gt.get_time_signature_confidence(h5_file, song)>0.2
		terms=np.array(gt.get_artist_terms(h5_file, song))
		assert terms.size>0
		segments=np.array(gt.get_segments_start)
		assert segments.size>0
		sections=np.array(gt.get_sections_start)
		assert sections.size>0
	except:
		return False
	return True
def load_raw_data():
    years = []
    ten_features=[]
    timbres = []
    pitches = []
    min_length = 10000
    num = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            h5 = getter.open_h5_file_read(f)
            num += 1
            print(num)
            try:
                year = getter.get_year(h5)
                if year!=0:
                    timbre = getter.get_segments_timbre(h5)
                    s = np.size(timbre,0)
                    if s>=100:
                        if s<min_length:
                            min_length = s
                        pitch = getter.get_segments_pitches(h5)
                        years.append(year)
                        timbres.append(timbre)
                        pitches.append(pitch)
                        title_length = len(getter.get_title(h5))
                        terms_length = len(getter.get_artist_terms(h5))
                        tags_length = len(getter.get_artist_mbtags(h5))
                        hotness = getter.get_artist_hotttnesss(h5)
                        duration = getter.get_duration(h5)
                        loudness = getter.get_loudness(h5)
                        mode = getter.get_mode(h5)
                        release_length = len(getter.get_release(h5))
                        tempo = getter.get_tempo(h5)
                        name_length = len(getter.get_artist_name(h5))
                        ten_feature = np.hstack([title_length, hotness, duration, tags_length,
                                                 terms_length,loudness, mode, release_length, tempo, name_length])

                        ten_features.append(ten_feature) 
            except:
                print(1)
            h5.close()
    return years, timbres, pitches,min_length,ten_features
Example #27
0
def h5_to_csv_fields(h5,song):
	'''Converts h5 format to text
		Inputs: h5, an h5 file object, usable with the wrapper code MSongsDB
			song, an integer, representing which song in the h5 file to take the info out of (h5 files contain many songs)
		Output: a string representing all the information of this song, as a single line of a csv file
	'''
	rv=[]
	##All these are regular getter functions from wrapper code
	rv.append(gt.get_artist_name(h5,song))
	rv.append(gt.get_title(h5, song))
	rv.append(gt.get_release(h5, song))
	rv.append(gt.get_year(h5,song))
	rv.append(gt.get_duration(h5,song))
	rv.append(gt.get_artist_familiarity(h5,song))
	rv.append(gt.get_artist_hotttnesss(h5,song))
	rv.append(gt.get_song_hotttnesss(h5, song))
	
	##artist_terms, artist_terms_freq, and artist_terms_weight getter functions
	##are all arrays, so we need to turn them into strings first. We used '_' as a separator
	rv.append(array_to_csv_field(list(gt.get_artist_terms(h5,song))))
	rv.append(array_to_csv_field(list(gt.get_artist_terms_freq(h5,song))))
	rv.append(array_to_csv_field(list(gt.get_artist_terms_weight(h5,song))))
	rv.append(gt.get_mode(h5,song))
	rv.append(gt.get_key(h5,song))
	rv.append(gt.get_tempo(h5,song))
	rv.append(gt.get_loudness(h5,song))
	rv.append(gt.get_danceability(h5,song))
	rv.append(gt.get_energy(h5,song))
	rv.append(gt.get_time_signature(h5,song))
	rv.append(array_to_csv_field(list(gt.get_segments_start(h5,song))))
	##These arrays have vectors (Arrays) as items, 12 dimensional each
	##An array like [[1,2,3],[4,5,6]] will be written to csv as '1;2;3_4;5;6', i.e. there's two types of separators
	rv.append(double_Array_to_csv_field(list(gt.get_segments_timbre(h5,song)),'_',';'))
	rv.append(double_Array_to_csv_field(list(gt.get_segments_pitches(h5,song)),'_',';'))
	rv.append(array_to_csv_field(list(gt.get_segments_loudness_start(h5,song))))
	rv.append(array_to_csv_field(list(gt.get_segments_loudness_max(h5,song))))
	rv.append(array_to_csv_field(list(gt.get_segments_loudness_max_time(h5,song))))
	rv.append(array_to_csv_field(list(gt.get_sections_start(h5,song))))
	##turn this list into a string with comma separators (i.e. a csv line)
	rv_string=array_to_csv_field(rv, ",")
	rv_string+="\n"
	return rv_string
Example #28
0
def getTrackInfo(starting_num):
    my_list = []
    f = hdf5_getters.open_h5_file_read(filepath)
    progress_bar = tqdm(range(tracks_per_thread))
    for iteration in progress_bar:
        i = int(iteration) + (starting_num*tracks_per_thread)
        track_id = hdf5_getters.get_track_id(f, i).decode()
        if track_id not in lyric_track_ids_set:
            continue # skip it an go on
        artist_name = hdf5_getters.get_artist_name(f, i).decode()
        duration = hdf5_getters.get_duration(f, i)
        loudness = hdf5_getters.get_loudness(f, i)
        tempo = hdf5_getters.get_tempo(f, i)
        title = hdf5_getters.get_title(f, i).decode()
        year = hdf5_getters.get_year(f, i)
        long_list = [track_id, artist_name, duration, loudness, tempo, title, year]
        my_list.append(long_list)
        progress_bar.set_description("Iteration %d" % i)
    f.close()
    return my_list
def func_to_extract_year(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    global cntnan	
    global listloudness
    h5 = GETTERS.open_h5_file_read(filename)
    nanfound = 0

    #Get target feature: song loudness 
    song_year = GETTERS.get_year(h5)
    if song_year == 0:
       nanfound = 1
       cntnan = cntnan + 1
    else:
       listyear.append(song_year)

    h5.close()
def get_all_data(target, basedir, ext='.h5') :

    # header
    target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                 "track_id", "song_id", "title", "artist_name", "artist_location",
                 "artist_hotttnesss", "release", "year", "song_hotttnesss",
                 "danceability", "duration", "loudness", "sample_rate", "tempo"
    ))

    count = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            for line in f:
                new_file = open("tmp.txt", 'w')
                new_file.write(line)

                h5 = hdf5_getters.open_h5_file_read(new_file)
                target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                              hdf5_getters.get_track_id(h5),
                              hdf5_getters.get_song_id(h5),
                              hdf5_getters.get_title(h5),
                              hdf5_getters.get_artist_name(h5),
                              hdf5_getters.get_artist_location(h5),
                              hdf5_getters.get_artist_hotttnesss(h5),
                              hdf5_getters.get_release(h5),
                              hdf5_getters.get_year(h5),
                              hdf5_getters.get_song_hotttnesss(h5),
                              hdf5_getters.get_danceability(h5),
                              hdf5_getters.get_duration(h5),
                              hdf5_getters.get_loudness(h5),
                              hdf5_getters.get_analysis_sample_rate(h5),
                              hdf5_getters.get_tempo(h5)
                ))

                # show progress
                count += 1
                print "%d/10000" % (count)

                h5.close()
Example #31
0
def get_all_attributes(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get all required attributes
    - write it to a csv file 
    - close the files
    """
    with open('attributes.csv', 'a') as csvfile:
        try:
            # let's apply the previous function to all files
            csvwriter = csv.writer(csvfile, delimiter='\t')
            h5 = GETTERS.open_h5_file_read(filename)
            RESULTS = []
            RESULTS.append(GETTERS.get_year(h5))
            RESULTS.append(GETTERS.get_artist_id(h5))
            RESULTS.append(GETTERS.get_artist_name(h5))
            RESULTS.append(GETTERS.get_artist_mbid(h5))
            RESULTS.append(convert_terms(GETTERS.get_artist_terms(h5)))
            RESULTS.append(GETTERS.get_artist_hotttnesss(h5))
            RESULTS.append(GETTERS.get_artist_latitude(h5))
            RESULTS.append(GETTERS.get_artist_longitude(h5))
            RESULTS.append(GETTERS.get_artist_familiarity(h5))
            RESULTS.append(GETTERS.get_danceability(h5))
            RESULTS.append(GETTERS.get_duration(h5))
            RESULTS.append(GETTERS.get_energy(h5))
            RESULTS.append(GETTERS.get_loudness(h5))
            RESULTS.append(GETTERS.get_song_hotttnesss(h5))
            RESULTS.append(GETTERS.get_song_id(h5))
            RESULTS.append(GETTERS.get_tempo(h5))
            RESULTS.append(GETTERS.get_time_signature(h5))
            RESULTS.append(GETTERS.get_title(h5))
            RESULTS.append(GETTERS.get_track_id(h5))
            RESULTS.append(GETTERS.get_release(h5))
            csvwriter.writerow(RESULTS)
            h5.close()
        except AttributeError:
            pass
def get_feats(h5):
    f = []
    f.append(hdf5_getters.get_artist_name(h5).decode('utf8').replace(',', ''))
    f.append(hdf5_getters.get_title(h5).decode('utf8').replace(',', ''))
    f.append(str(hdf5_getters.get_loudness(h5)))
    f.append(str(hdf5_getters.get_tempo(h5)))
    f.append(str(hdf5_getters.get_time_signature(h5)))
    f.append(str(hdf5_getters.get_key(h5)))
    f.append(str(hdf5_getters.get_mode(h5)))
    f.append(str(hdf5_getters.get_duration(h5)))
    f.extend(get_statistical_feats(hdf5_getters.get_segments_timbre(h5)))
    f.extend(get_statistical_feats(hdf5_getters.get_segments_pitches(h5)))
    f.extend(get_statistical_feats(hdf5_getters.get_segments_loudness_max(h5)))
    f.extend(
        get_statistical_feats(hdf5_getters.get_segments_loudness_max_time(h5)))
    f.extend(
        get_statistical_feats(hdf5_getters.get_segments_loudness_start(h5)))
    f.append(str(hdf5_getters.get_song_hotttnesss(h5)))
    f.append(str(hdf5_getters.get_danceability(h5)))
    f.append(str(hdf5_getters.get_end_of_fade_in(h5)))
    f.append(str(hdf5_getters.get_energy(h5)))
    f.append(str(hdf5_getters.get_start_of_fade_out(h5)))
    f.append(str(hdf5_getters.get_year(h5)))
    return f
Example #33
0
def parse_aggregate_songs(file_name,file_name2,artist_map):
    """
    Given an aggregate filename and artist_map in the format
    {artist_name: {data pertaining to artist}}
    """
    """
    TODO: 
    -this function goes through each song, if artist not in there,
    add all data necesary and add first song info.
    else update any specific song info

    -song info is a map from attributename:[values]
    """
    #artist_map = {}
    h5 = hdf5_getters.open_h5_file_read(file_name)
    numSongs = hdf5_getters.get_num_songs(h5)
    print 'Parsing song file...'
    for i in range(numSongs):
        artist_name = hdf5_getters.get_artist_name(h5,i)

        #Filter location
        longi = hdf5_getters.get_artist_longitude(h5,i)
        lat = hdf5_getters.get_artist_latitude(h5,i)
        loc = hdf5_getters.get_artist_location(h5,i)
        if math.isnan(lat) or math.isnan(longi):
            #skip if no location
            continue

        #filter year
        yr = hdf5_getters.get_year(h5,i)
        if yr == 0:
            #skip if no year
            continue

        #filter hotttness and familiarity
        familiarity = hdf5_getters.get_artist_familiarity(h5,i)
        hotttness = hdf5_getters.get_artist_hotttnesss(h5,i)
        if familiarity<=0.0 or hotttness<=0.0:
            #skip if no hotttness or familiarity computations
            continue

        #TODO:MAYBE filter on dance and energy
        timbre = hdf5_getters.get_segments_timbre(h5,i)
        #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each
        if not artist_name in artist_map:
            #have not encountered the artist yet, so populate new map
            sub_map = {}
            sub_map['artist_familiarity'] = familiarity
            sub_map['artist_hotttnesss'] = hotttness
            sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i)
            #longi = hdf5_getters.get_artist_longitude(h5,i)
            #lat = hdf5_getters.get_artist_latitude(h5,i)
            #longi = None if math.isnan(longi) else longi
            #lat = None if math.isnan(lat) else lat
            sub_map['artist_latitude'] = lat
            sub_map['artist_longitude'] = longi
            sub_map['artist_location'] = loc
            sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i)
            #TODO:see if should weight by freq or weight for if the term matches one of the feature terms
            sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i))
            sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i))

            #song-sepcific data
            #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA:
            #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            sub_map['danceability'] = [dance]
            sub_map['duration'] = [hdf5_getters.get_duration(h5,i)]
            sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)]
            sub_map['energy'] = [energy]
            #since each song has a key, ask if feature for keys should be num of songs that appear in that key or
            #just binary if any of their songs has that key or just be avg of songs with that key
            #same for mode, since its either major or minor...should it be count or avg.?
            sub_map['key'] = [hdf5_getters.get_key(h5,i)]
            sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)]
            sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            sub_map['song_hotttnesss'] = [s_hot]
            sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)]
            sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)]
            #should time signature be count as well? binary?
            sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)]
            sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)]
            #should year be binary since they can have many songs across years and should it be year:count
            sub_map['year'] = [yr]

            artist_map[artist_name] = sub_map
        else:
            #artist already exists, so get its map and update song fields
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            artist_map[artist_name]['danceability'].append(dance)
            artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i))
            artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i))
            artist_map[artist_name]['energy'].append(energy)
            artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i))
            artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i))
            artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            artist_map[artist_name]['song_hotttnesss'].append(s_hot)
            artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i))
            artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i))
            #should time signature be count as well? binary?
            artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i))
            artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i))
            #should year be binary since they can have many songs across years and should it be year:count
            artist_map[artist_name]['year'].append(yr)

    h5 = hdf5_getters.open_h5_file_read(file_name2)
    numSongs = hdf5_getters.get_num_songs(h5)
    print 'Parsing song file2...'
    for i in range(numSongs):
        song_id = hdf5_getters.get_track_id(h5,i)
        artist_name = hdf5_getters.get_artist_name(h5,i)
        if artist_name in artist_map and song_id in artist_map[artist_name]['track_id']:
            continue

        #Filter location
        longi = hdf5_getters.get_artist_longitude(h5,i)
        lat = hdf5_getters.get_artist_latitude(h5,i)
        loc = hdf5_getters.get_artist_location(h5,i)
        if math.isnan(lat) or math.isnan(longi):
            #skip if no location
            continue

        #filter year
        yr = hdf5_getters.get_year(h5,i)
        if yr == 0:
            #skip if no year
            continue

        #filter hotttness and familiarity
        familiarity = hdf5_getters.get_artist_familiarity(h5,i)
        hotttness = hdf5_getters.get_artist_hotttnesss(h5,i)
        if familiarity<=0.0 or hotttness<=0.0:
            #skip if no hotttness or familiarity computations
            continue

        #TODO:MAYBE filter on dance and energy
        timbre = hdf5_getters.get_segments_timbre(h5,i)
        #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each
        if not artist_name in artist_map:
            #have not encountered the artist yet, so populate new map
            sub_map = {}
            sub_map['artist_familiarity'] = familiarity
            sub_map['artist_hotttnesss'] = hotttness
            sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i)
            #longi = hdf5_getters.get_artist_longitude(h5,i)
            #lat = hdf5_getters.get_artist_latitude(h5,i)
            #longi = None if math.isnan(longi) else longi
            #lat = None if math.isnan(lat) else lat
            sub_map['artist_latitude'] = lat
            sub_map['artist_longitude'] = longi
            sub_map['artist_location'] = loc
            sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i)
            #TODO:see if should weight by freq or weight for if the term matches one of the feature terms
            sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i))
            sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i))

            #song-sepcific data
            #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA:
            #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            sub_map['danceability'] = [dance]
            sub_map['duration'] = [hdf5_getters.get_duration(h5,i)]
            sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)]
            sub_map['energy'] = [energy]
            #since each song has a key, ask if feature for keys should be num of songs that appear in that key or
            #just binary if any of their songs has that key or just be avg of songs with that key
            #same for mode, since its either major or minor...should it be count or avg.?
            sub_map['key'] = [hdf5_getters.get_key(h5,i)]
            sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)]
            sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            sub_map['song_hotttnesss'] = [s_hot]
            sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)]
            sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)]
            #should time signature be count as well? binary?
            sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)]
            sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)]
            #should year be binary since they can have many songs across years and should it be year:count
            sub_map['year'] = [yr]

            artist_map[artist_name] = sub_map
        else:
            #artist already exists, so get its map and update song fields
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            artist_map[artist_name]['danceability'].append(dance)
            artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i))
            artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i))
            artist_map[artist_name]['energy'].append(energy)
            artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i))
            artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i))
            artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            artist_map[artist_name]['song_hotttnesss'].append(s_hot)
            artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i))
            artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i))
            #should time signature be count as well? binary?
            artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i))
            artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i))
            #should year be binary since they can have many songs across years and should it be year:count
            artist_map[artist_name]['year'].append(yr)
# relevant metadata for all EDM songs found in the MSD 
all_song_data = {}
pitch_segs_data = []
count = 0
start_time = time.time()

''' preprocessing before actually running the dirichlet process takes much more time 
than running the dirichlet process on data '''

for root, dirs, files in os.walk(basedir):
    files = glob.glob(os.path.join(root,'*'+ext))
    for f in files:
        h5 = hdf5_getters.open_h5_file_read(f)
        # if year unknown, throw out sample
        if hdf5_getters.get_year(h5) == 0: 
            h5.close()
            continue
        if any(tag in str(hdf5_getters.get_artist_mbtags(h5)) for tag in target_genres): 
            print 'found electronic music song at {0} seconds'.format(time.time()-start_time)
            count += 1  
            chord_changes = [0 for i in range(0,192)]
            segments_pitches_old = hdf5_getters.get_segments_pitches(h5)
            segments_pitches_old_smoothed = []
            smoothing_factor = max(3,round(len(segments_pitches_old) * 60.0 / (hdf5_getters.get_tempo(h5) * hdf5_getters.get_duration(h5))))
            for i in range(0,int(math.floor(len(segments_pitches_old))/smoothing_factor)):
                segments = segments_pitches_old[(smoothing_factor*i):(smoothing_factor*i+smoothing_factor)]
                # calculate mean frequency of each note over a block of 5 time segments
                segments_mean = map(mean, zip(*segments))
                segments_pitches_old_smoothed.append(segments_mean)
            most_likely_chords = [msd_utils.find_most_likely_chord(seg) for seg in segments_pitches_old_smoothed]
def data_to_flat_file(basedir,ext='.h5') :
    """ This function extracts the information from the tables and creates the flat file. """
    count = 0; #song counter
    list_to_write= []
    group_index=0
    row_to_write = ""
    writer = csv.writer(open("complete.csv", "wb"))
    for root, dirs, files in os.walk(basedir):
	files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
	    row=[]
	    print f
            h5 = hdf5_getters.open_h5_file_read(f)
	    title = hdf5_getters.get_title(h5) 
	    title= title.replace('"','') 
            row.append(title)
	    comma=title.find(',')
	    if	comma != -1:
		    print title
		    time.sleep(1)
	    album = hdf5_getters.get_release(h5)
	    album= album.replace('"','')
            row.append(album)
	    comma=album.find(',')
	    if	comma != -1:
		    print album
		    time.sleep(1)
	    artist_name = hdf5_getters.get_artist_name(h5)
	    comma=artist_name.find(',')
	    if	comma != -1:
		    print artist_name
		    time.sleep(1)
	    artist_name= artist_name.replace('"','')
            row.append(artist_name)
	    duration = hdf5_getters.get_duration(h5)
            row.append(duration)
	    samp_rt = hdf5_getters.get_analysis_sample_rate(h5)
            row.append(samp_rt)
	    artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5)
            row.append(artist_7digitalid)
	    artist_fam = hdf5_getters.get_artist_familiarity(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_fam) == True:
	            artist_fam=-1
            row.append(artist_fam)
	    artist_hotness= hdf5_getters.get_artist_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_hotness) == True:
	             artist_hotness=-1
            row.append(artist_hotness)
	    artist_id = hdf5_getters.get_artist_id(h5)
            row.append(artist_id)           
	    artist_lat = hdf5_getters.get_artist_latitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lat) == True:
	            artist_lat=-1
            row.append(artist_lat)
	    artist_loc = hdf5_getters.get_artist_location(h5)
            row.append(artist_loc)
	    artist_lon = hdf5_getters.get_artist_longitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lon) == True:
	            artist_lon=-1
            row.append(artist_lon)
	    artist_mbid = hdf5_getters.get_artist_mbid(h5)
            row.append(artist_mbid)

	    #Getting the genre				       
            art_trm = hdf5_getters.get_artist_terms(h5)
            trm_freq = hdf5_getters.get_artist_terms_freq(h5)
	    trn_wght = hdf5_getters.get_artist_terms_weight(h5)
	    a_mb_tags = hdf5_getters.get_artist_mbtags(h5)
	    genre_indexes=get_genre_indexes(trm_freq) 		    #index of the highest freq
	    genre_set=0					            #flag to see if the genre has been set or not
	    final_genre=[]
	    genres_so_far=[]
	    for i in range(len(genre_indexes)):
		    genre_tmp=get_genre(art_trm,genre_indexes[i])   #genre that corresponds to the highest freq
		    genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary
		    if len(genres_so_far) != 0:
			for i in genres_so_far:
				final_genre.append(i)
			    	genre_set=1
			
			
	    if genre_set == 1:
		col_num=[]
		for i in final_genre:
			column=int(i)				#getting the column number of the genre
			col_num.append(column)
	
		genre_array=genre_columns(col_num)	                #genre array 
	        for i in range(len(genre_array)):                   	#appending the genre_array to the row 
			row.append(genre_array[i])
	    else:
		genre_array=genre_columns(-1)				#when there is no genre matched, return an array of [0...0]
	        for i in range(len(genre_array)):                   	#appending the genre_array to the row 
			row.append(genre_array[i])
					

	    artist_pmid = hdf5_getters.get_artist_playmeid(h5)
            row.append(artist_pmid)
	    audio_md5 = hdf5_getters.get_audio_md5(h5)
            row.append(audio_md5)
	    danceability = hdf5_getters.get_danceability(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(danceability) == True:
	            danceability=-1
            row.append(danceability)
	    end_fade_in =hdf5_getters.get_end_of_fade_in(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(end_fade_in) == True:
	            end_fade_in=-1
            row.append(end_fade_in)
	    energy = hdf5_getters.get_energy(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(energy) == True:
	            energy=-1
            row.append(energy)
            song_key = hdf5_getters.get_key(h5)
            row.append(song_key)
	    key_c = hdf5_getters.get_key_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(key_c) == True:
	            key_c=-1
            row.append(key_c)
	    loudness = hdf5_getters.get_loudness(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(loudness) == True:
	            loudness=-1
            row.append(loudness)
	    mode = hdf5_getters.get_mode(h5)
            row.append(mode)
	    mode_conf = hdf5_getters.get_mode_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(mode_conf) == True:
	            mode_conf=-1
            row.append(mode_conf)
	    release_7digitalid = hdf5_getters.get_release_7digitalid(h5)
            row.append(release_7digitalid)
	    song_hot = hdf5_getters.get_song_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(song_hot) == True:
	            song_hot=-1
            row.append(song_hot)
	    song_id = hdf5_getters.get_song_id(h5)
            row.append(song_id)
	    start_fade_out = hdf5_getters.get_start_of_fade_out(h5)
            row.append(start_fade_out)
	    tempo = hdf5_getters.get_tempo(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(tempo) == True:
	            tempo=-1
            row.append(tempo)
	    time_sig = hdf5_getters.get_time_signature(h5)
            row.append(time_sig)
	    time_sig_c = hdf5_getters.get_time_signature_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(time_sig_c) == True:
	            time_sig_c=-1
            row.append(time_sig_c)
	    track_id = hdf5_getters.get_track_id(h5)
            row.append(track_id)
	    track_7digitalid = hdf5_getters.get_track_7digitalid(h5)
            row.append(track_7digitalid)
	    year = hdf5_getters.get_year(h5)
            row.append(year)
	    bars_c = hdf5_getters.get_bars_confidence(h5)
            bars_start = hdf5_getters.get_bars_start(h5)
	    row_bars_padding=padding(245)   #this is the array that will be attached at the end of th row

	    #--------------bars---------------"
	    gral_info=[]
	    gral_info=row[:]
	    empty=[]
	    for i,item in enumerate(bars_c):
                row.append(group_index)
                row.append(i)
                row.append(bars_c[i])
	        bars_c_avg= get_avg(bars_c)
                row.append(bars_c_avg)
	        bars_c_max= get_max(bars_c)	
                row.append(bars_c_max)
	        bars_c_min = get_min(bars_c)
                row.append(bars_c_min)
	        bars_c_stddev= get_stddev(bars_c)
                row.append(bars_c_stddev)
	        bars_c_count = get_count(bars_c)
                row.append(bars_c_count)
	        bars_c_sum = get_sum(bars_c)
                row.append(bars_c_sum)
                row.append(bars_start[i])	         
	        bars_start_avg = get_avg(bars_start)
                row.append(bars_start_avg)	         
	        bars_start_max= get_max(bars_start)
                row.append(bars_start_max)	         
	        bars_start_min = get_min(bars_start)
                row.append(bars_start_min)	         
	        bars_start_stddev= get_stddev(bars_start)
                row.append(bars_start_stddev)	         
	        bars_start_count = get_count(bars_start)
                row.append(bars_start_count)	         
	        bars_start_sum = get_sum(bars_start)
                row.append(bars_start_sum)	         
		for i in row_bars_padding:
			row.append(i)

                writer.writerow(row)
		row=[]
		row=gral_info[:]
	 

            #--------beats---------------"
	    beats_c = hdf5_getters.get_beats_confidence(h5)
	    group_index=1
	    row=[]
	    row=gral_info[:]
	    row_front=padding(14)  	#blanks left in front of the row(empty spaces for bars)
	    row_beats_padding=padding(231)
	    for i,item in enumerate(beats_c):
	   	row.append(group_index)
		row.append(i)
		for index in row_front:  #padding blanks in front of the beats
			row.append(index)
		
		row.append(beats_c[i])
	        beats_c_avg= get_avg(beats_c)
		row.append(beats_c_avg)
	        beats_c_max= get_max(beats_c)
		row.append(beats_c_max)
                beats_c_min = get_min(beats_c)
		row.append(beats_c_min)
	        beats_c_stddev= get_stddev(beats_c)
		row.append(beats_c_stddev)
	        beats_c_count = get_count(beats_c)
		row.append(beats_c_count)
	        beats_c_sum = get_sum(beats_c)
		row.append(beats_c_sum)
                beats_start = hdf5_getters.get_beats_start(h5)
		row.append(beats_start[i])
 	        beats_start_avg = get_avg(beats_start)
		row.append(beats_start_avg)
	        beats_start_max= get_max(beats_start)
		row.append(beats_start_max)
	        beats_start_min = get_min(beats_start)
		row.append(beats_start_min)
	        beats_start_stddev= get_stddev(beats_start)
		row.append(beats_start_stddev)
	        beats_start_count = get_count(beats_start)
		row.append(beats_start_count)
	        beats_start_sum = get_sum(beats_start)
		row.append(beats_start_sum)
		for i in row_beats_padding:
			row.append(i)
                
		writer.writerow(row)
		row=[]
		row=gral_info[:]

            # "--------sections---------------"
	    row_sec_padding=padding(217)	#blank spaces left at the end of the row
	    sec_c = hdf5_getters.get_sections_confidence(h5)
	    group_index=2
	    row=[]
	    row=gral_info[:]
	    row_front=padding(28)		#blank spaces left in front(empty spaces for bars,beats)
	    for i,item in enumerate(sec_c):
		row.append(group_index)
		row.append(i)
		for index in row_front:  	#padding blanks in front of the sections
			row.append(index)

		row.append(sec_c[i])
                sec_c_avg= get_avg(sec_c)
		row.append(sec_c_avg)
	        sec_c_max= get_max(sec_c)
		row.append(sec_c_max)
	        sec_c_min = get_min(sec_c)
		row.append(sec_c_min)
	        sec_c_stddev= get_stddev(sec_c)
		row.append(sec_c_stddev)
	        sec_c_count = get_count(sec_c)
		row.append(sec_c_count)
	        sec_c_sum = get_sum(sec_c)
		row.append(sec_c_sum)
	        sec_start = hdf5_getters.get_sections_start(h5)
		row.append(sec_start[i])	   
                sec_start_avg = get_avg(sec_start)
		row.append(sec_start_avg)
	        sec_start_max= get_max(sec_start)
		row.append(sec_start_max)
	        sec_start_min = get_min(sec_start)
		row.append(sec_start_min)
	        sec_start_stddev= get_stddev(sec_start)
		row.append(sec_start_stddev)
	        sec_start_count = get_count(sec_start)
		row.append(sec_start_count)
	        sec_start_sum = get_sum(sec_start)
		row.append(sec_start_sum)
		for i in row_sec_padding:	#appending the blank spaces at the end of the row
			row.append(i)
                

		writer.writerow(row)
		row=[]
		row=gral_info[:]


            #--------segments---------------"
	    row_seg_padding=padding(182)	#blank spaces at the end of the row
 	    row_front=padding(42)		#blank spaces left in front of segments
	    seg_c = hdf5_getters.get_segments_confidence(h5)
	    group_index=3
	    row=[]
	    row=gral_info[:]
	    for i,item in enumerate(seg_c):
		row.append(group_index)
		row.append(i)
		for index in row_front:  	#padding blanks in front of the segments
			row.append(index)

		row.append(seg_c[i])
                seg_c_avg= get_avg(seg_c)
		row.append(seg_c_avg)
	        seg_c_max= get_max(seg_c)
		row.append(seg_c_max)
	        seg_c_min = get_min(seg_c)
		row.append(seg_c_min)
	        seg_c_stddev= get_stddev(seg_c)
		row.append(seg_c_stddev)
	        seg_c_count = get_count(seg_c)
		row.append(seg_c_count)
	        seg_c_sum = get_sum(seg_c)
		row.append(seg_c_sum)
                seg_loud_max = hdf5_getters.get_segments_loudness_max(h5)
		row.append(seg_loud_max[i])
                seg_loud_max_avg= get_avg(seg_loud_max)
		row.append(seg_loud_max_avg)
	        seg_loud_max_max= get_max(seg_loud_max)
		row.append(seg_loud_max_max)
	        seg_loud_max_min = get_min(seg_loud_max)
		row.append(seg_loud_max_min)
	        seg_loud_max_stddev= get_stddev(seg_loud_max)
		row.append(seg_loud_max_stddev)
	        seg_loud_max_count = get_count(seg_loud_max)
		row.append(seg_loud_max_count)
	        seg_loud_max_sum = get_sum(seg_loud_max)
		row.append(seg_loud_max_sum)
	        seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
		row.append(seg_loud_max_time[i])
	        seg_loud_max_time_avg= get_avg(seg_loud_max_time)
		row.append(seg_loud_max_time_avg)
	        seg_loud_max_time_max= get_max(seg_loud_max_time)
		row.append(seg_loud_max_time_max)
	        seg_loud_max_time_min = get_min(seg_loud_max_time)
		row.append(seg_loud_max_time_min)
	        seg_loud_max_time_stddev= get_stddev(seg_loud_max_time)
		row.append(seg_loud_max_time_stddev)
	        seg_loud_max_time_count = get_count(seg_loud_max_time)
		row.append(seg_loud_max_time_count)
	        seg_loud_max_time_sum = get_sum(seg_loud_max_time)
		row.append(seg_loud_max_time_sum)
	        seg_loud_start = hdf5_getters.get_segments_loudness_start(h5)
		row.append(seg_loud_start[i])
	        seg_loud_start_avg= get_avg(seg_loud_start)
		row.append(seg_loud_start_avg)
	        seg_loud_start_max= get_max(seg_loud_start)
		row.append(seg_loud_start_max)
	        seg_loud_start_min = get_min(seg_loud_start)
		row.append(seg_loud_start_min)
	        seg_loud_start_stddev= get_stddev(seg_loud_start)
		row.append(seg_loud_start_stddev)
	        seg_loud_start_count = get_count(seg_loud_start)
		row.append(seg_loud_start_count)
	        seg_loud_start_sum = get_sum(seg_loud_start)					      
		row.append(seg_loud_start_sum)
	        seg_start = hdf5_getters.get_segments_start(h5)
		row.append(seg_start[i])
	        seg_start_avg= get_avg(seg_start)
		row.append(seg_start_avg)
	        seg_start_max= get_max(seg_start)
		row.append(seg_start_max)
	        seg_start_min = get_min(seg_start)
		row.append(seg_start_min)
	        seg_start_stddev= get_stddev(seg_start)
		row.append(seg_start_stddev)
	        seg_start_count = get_count(seg_start)
		row.append(seg_start_count)
	        seg_start_sum = get_sum(seg_start)
		row.append(seg_start_sum)
		for i in row_seg_padding:	#appending blank spaces at the end of the row
			row.append(i)
                
		writer.writerow(row)
		row=[]
		row=gral_info[:]

	    #----------segments pitch and timbre---------------"
	    row_seg2_padding=padding(14)	#blank spaces left at the end of the row
	    row_front=padding(77)		#blank spaces left at the front of the segments and timbre
	    seg_pitch = hdf5_getters.get_segments_pitches(h5)
	    transpose_pitch= seg_pitch.transpose()          #this is to tranpose the matrix,so we can have 12 rows
	    group_index=4
	    row=[]
	    row=gral_info[:]
	    for i,item in enumerate(transpose_pitch[0]):
		row.append(group_index)
		row.append(i)
		for index in row_front:  	#padding blanks in front of segments and timbre
			row.append(index)
	   
		row.append(transpose_pitch[0][i])
  		seg_pitch_avg= get_avg(transpose_pitch[0])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[0])	
		row.append(seg_pitch_max)
		seg_pitch_min = get_min(transpose_pitch[0])
		row.append(seg_pitch_min)
		seg_pitch_stddev= get_stddev(transpose_pitch[0])
		row.append(seg_pitch_stddev)
		seg_pitch_count = get_count(transpose_pitch[0])
		row.append(seg_pitch_count)
		seg_pitch_sum = get_sum(transpose_pitch[0])
		row.append(seg_pitch_sum)   
 		row.append(transpose_pitch[1][i])
 		seg_pitch_avg= get_avg(transpose_pitch[1])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[1])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[1])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[1])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[1])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[1])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[2][i])
 		seg_pitch_avg= get_avg(transpose_pitch[2])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[2])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[2])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[2])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[2])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[2])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[3][i])
 		seg_pitch_avg= get_avg(transpose_pitch[3])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[3])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[3])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[3])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[3])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[3])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[4][i])
 		seg_pitch_avg= get_avg(transpose_pitch[4])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[4])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[4])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[4])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[4])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[4])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[5][i])
 		seg_pitch_avg= get_avg(transpose_pitch[5])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[5])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[5])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[5])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[5])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[5])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[6][i])
 		seg_pitch_avg= get_avg(transpose_pitch[6])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[6])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[6])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[6])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[6])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[6])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[7][i])
 		seg_pitch_avg= get_avg(transpose_pitch[7])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[7])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[7])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[7])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[7])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[7])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[8][i])
 		seg_pitch_avg= get_avg(transpose_pitch[8])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[8])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[8])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[8])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[8])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[8])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[9][i])
 		seg_pitch_avg= get_avg(transpose_pitch[9])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[9])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[9])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[9])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[9])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[9])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[10][i])
 		seg_pitch_avg= get_avg(transpose_pitch[10])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[10])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[10])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[10])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[10])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[10])
		row.append(seg_pitch_sum)   
		row.append(transpose_pitch[11][i])
 		seg_pitch_avg= get_avg(transpose_pitch[11])
		row.append(seg_pitch_avg)
		seg_pitch_max= get_max(transpose_pitch[11])	
		row.append(seg_pitch_max)
	        seg_pitch_min = get_min(transpose_pitch[11])
		row.append(seg_pitch_min)
	        seg_pitch_stddev= get_stddev(transpose_pitch[11])
		row.append(seg_pitch_stddev)
	        seg_pitch_count = get_count(transpose_pitch[11])
		row.append(seg_pitch_count)
	        seg_pitch_sum = get_sum(transpose_pitch[11])
		row.append(seg_pitch_sum)   
		#timbre arrays
	        seg_timbre = hdf5_getters.get_segments_timbre(h5)
                transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows
		row.append(transpose_timbre[0][i])
  		seg_timbre_avg= get_avg(transpose_timbre[0])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[0])	
		row.append(seg_timbre_max)
		seg_timbre_min = get_min(transpose_timbre[0])
		row.append(seg_timbre_min)
		seg_timbre_stddev=get_stddev(transpose_timbre[0])
		row.append(seg_timbre_stddev)
		seg_timbre_count = get_count(transpose_timbre[0])
		row.append(seg_timbre_count)
		seg_timbre_sum = get_sum(transpose_timbre[0])
		row.append(seg_timbre_sum)   
 		row.append(transpose_timbre[1][i])
 		seg_timbre_avg= get_avg(transpose_timbre[1])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[1])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[1])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[1])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[1])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[1])
		row.append(seg_timbre_sum)   
		row.append(transpose_timbre[2][i])
 		seg_timbre_avg= get_avg(transpose_timbre[2])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[2])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[2])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[2])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[2])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[2])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[3][i])
 		seg_timbre_avg= get_avg(transpose_timbre[3])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[3])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[3])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[3])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[3])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[3])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[4][i])
 		seg_timbre_avg= get_avg(transpose_timbre[4])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[4])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[4])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[4])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[4])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[4])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[5][i])
 		seg_timbre_avg= get_avg(transpose_timbre[5])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[5])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[5])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[5])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[5])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[5])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[6][i])
 		seg_timbre_avg= get_avg(transpose_timbre[6])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[6])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[6])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[6])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[6])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[6])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[7][i])
 		seg_timbre_avg= get_avg(transpose_timbre[7])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[7])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[7])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[7])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[7])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[7])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[8][i])
 		seg_timbre_avg= get_avg(transpose_timbre[8])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[8])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[8])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[8])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[8])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[8])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[9][i])
 		seg_timbre_avg= get_avg(transpose_timbre[9])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[9])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[9])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[9])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[9])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[9])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[10][i])
 		seg_timbre_avg= get_avg(transpose_timbre[10])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[10])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[10])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[10])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[10])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[10])
		row.append(seg_timbre_sum)   
		
		row.append(transpose_timbre[11][i])
 		seg_timbre_avg= get_avg(transpose_timbre[11])
		row.append(seg_timbre_avg)
		seg_timbre_max= get_max(transpose_timbre[11])	
		row.append(seg_timbre_max)
	        seg_timbre_min = get_min(transpose_timbre[11])
		row.append(seg_timbre_min)
	        seg_timbre_stddev= get_stddev(transpose_timbre[11])
		row.append(seg_timbre_stddev)
	        seg_timbre_count = get_count(transpose_timbre[11])
		row.append(seg_timbre_count)
	        seg_timbre_sum = get_sum(transpose_timbre[11])
		row.append(seg_timbre_sum)
	        for item in row_seg2_padding:
			row.append(item)
		writer.writerow(row)
		row=[]
		row=gral_info[:]


            # "--------tatums---------------"
	    tatms_c = hdf5_getters.get_tatums_confidence(h5)
	    group_index=5
	    row_front=padding(245)	#blank spaces left in front of tatums
	    row=[]
	    row=gral_info[:]
	    for i,item in enumerate(tatms_c):
		row.append(group_index)
		row.append(i)
		for item in row_front:	#appending blank spaces at the front of the row
			row.append(item)

		row.append(tatms_c[i])
		tatms_c_avg= get_avg(tatms_c)
		row.append(tatms_c_avg)
	 	tatms_c_max= get_max(tatms_c)
		row.append(tatms_c_max)
	        tatms_c_min = get_min(tatms_c)
		row.append(tatms_c_min)
	        tatms_c_stddev= get_stddev(tatms_c)
		row.append(tatms_c_stddev)
                tatms_c_count = get_count(tatms_c)
		row.append(tatms_c_count)
                tatms_c_sum = get_sum(tatms_c)
		row.append(tatms_c_sum)
                tatms_start = hdf5_getters.get_tatums_start(h5)
		row.append(tatms_start[i])
	        tatms_start_avg= get_avg(tatms_start)
		row.append(tatms_start_avg)
	        tatms_start_max= get_max(tatms_start)
		row.append(tatms_start_max)
	        tatms_start_min = get_min(tatms_start)
		row.append(tatms_start_min)
	        tatms_start_stddev= get_stddev(tatms_start)
		row.append(tatms_start_stddev)
	        tatms_start_count = get_count(tatms_start)
		row.append(tatms_start_count)
	        tatms_start_sum = get_sum(tatms_start)				   
		row.append(tatms_start_sum)
		writer.writerow(row)
		row=[]
		row=gral_info[:]


 
	    transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_pitch_avg=[]
	    seg_pitch_max=[]
	    seg_pitch_min=[]
            seg_pitch_stddev=[]
            seg_pitch_count=[]
	    seg_pitch_sum=[]
            i=0
	    #Getting the aggregate values in the pitches array
	    for row in transpose_pitch:
		   seg_pitch_avg.append(get_avg(row))
		   seg_pitch_max.append(get_max(row))
	           seg_pitch_min.append(get_min(row))
		   seg_pitch_stddev.append(get_stddev(row))
		   seg_pitch_count.append(get_count(row))
                   seg_pitch_sum.append(get_sum(row))
		   i=i+1

	    #extracting information from the timbre array 
            transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_timbre_avg=[]
	    seg_timbre_max=[]
	    seg_timbre_min=[]
            seg_timbre_stddev=[]
            seg_timbre_count=[]
	    seg_timbre_sum=[]
            i=0
	    for row in transpose_timbre:
		   seg_timbre_avg.append(get_avg(row))
		   seg_timbre_max.append(get_max(row))
	           seg_timbre_min.append(get_min(row))
		   seg_timbre_stddev.append(get_stddev(row))
		   seg_timbre_count.append(get_count(row))
                   seg_timbre_sum.append(get_sum(row))
		   i=i+1








	    h5.close()
	    count=count+1;
	    print count;
Example #36
0
def get_song_info(h5):
    print '%s - %s | (%s) | %s bpm' % (hdf5_getters.get_artist_name(h5), hdf5_getters.get_title(h5), hdf5_getters.get_year(h5), hdf5_getters.get_tempo(h5))
Example #37
0
def main():
    outputFile1 = open('SongCSV.csv', 'w')
    csvRowString = ""

    #################################################
    #if you want to prompt the user for the order of attributes in the csv,
    #leave the prompt boolean set to True
    #else, set 'prompt' to False and set the order of attributes in the 'else'
    #clause
    prompt = False
    #################################################
    if prompt == True:
        while prompt:

            prompt = False

            csvAttributeString = raw_input(
                "\n\nIn what order would you like the colums of the CSV file?\n"
                + "Please delineate with commas. The options are: " +
                "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"
                +
                " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo,"
                +
                " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n"
                +
                "For example, you may write \"Title, Tempo, Duration\"...\n\n"
                + "...or exit by typing 'exit'.\n\n")

            csvAttributeList = re.split('\W+', csvAttributeString)
            for i, v in enumerate(csvAttributeList):
                csvAttributeList[i] = csvAttributeList[i].lower()

            for attribute in csvAttributeList:
                # print "Here is the attribute: " + attribute + " \n"

                if attribute == 'AlbumID'.lower():
                    csvRowString += 'AlbumID'
                elif attribute == 'AlbumName'.lower():
                    csvRowString += 'AlbumName'
                elif attribute == 'ArtistID'.lower():
                    csvRowString += 'ArtistID'
                elif attribute == 'ArtistLatitude'.lower():
                    csvRowString += 'ArtistLatitude'
                elif attribute == 'ArtistLocation'.lower():
                    csvRowString += 'ArtistLocation'
                elif attribute == 'ArtistLongitude'.lower():
                    csvRowString += 'ArtistLongitude'
                elif attribute == 'ArtistName'.lower():
                    csvRowString += 'ArtistName'
                elif attribute == 'Danceability'.lower():
                    csvRowString += 'Danceability'
                elif attribute == 'Duration'.lower():
                    csvRowString += 'Duration'
                elif attribute == 'KeySignature'.lower():
                    csvRowString += 'KeySignature'
                elif attribute == 'KeySignatureConfidence'.lower():
                    csvRowString += 'KeySignatureConfidence'
                elif attribute == 'SongID'.lower():
                    csvRowString += "SongID"
                elif attribute == 'Tempo'.lower():
                    csvRowString += 'Tempo'
                elif attribute == 'TimeSignature'.lower():
                    csvRowString += 'TimeSignature'
                elif attribute == 'TimeSignatureConfidence'.lower():
                    csvRowString += 'TimeSignatureConfidence'
                elif attribute == 'Title'.lower():
                    csvRowString += 'Title'
                elif attribute == 'Year'.lower():
                    csvRowString += 'Year'
                elif attribute == 'Exit'.lower():
                    sys.exit()
                else:
                    prompt = True
                    print "=============="
                    print "I believe there has been an error with the input."
                    print "=============="
                    break

                csvRowString += ","

            lastIndex = len(csvRowString)
            csvRowString = csvRowString[0:lastIndex - 1]
            csvRowString += "\n"
            outputFile1.write(csvRowString)
            csvRowString = ""
    #else, if you want to hard code the order of the csv file and not prompt
    #the user,
    else:
        #################################################
        #change the order of the csv file here
        #Default is to list all available attributes (in alphabetical order)
        csvRowString = (
            "SongID,AlbumID,AlbumName,TrackId,ArtistID,ArtistLatitude,ArtistLocation,"
            +
            "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature," +
            "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"
            + "Title,Year")
        #################################################

        csvAttributeList = re.split('\W+', csvRowString)
        for i, v in enumerate(csvAttributeList):
            csvAttributeList[i] = csvAttributeList[i].lower()
        outputFile1.write("SongNumber,")
        outputFile1.write(csvRowString + "\n")
        csvRowString = ""

    #################################################

    #Set the basedir here, the root directory from which the search
    #for files stored in a (hierarchical data structure) will originate
    basedir = "/home/umwangye/millonsong/MillionSongSubset/data/"  # "." As the default means the current directory
    ext = ".h5"  #Set the extension here. H5 is the extension for HDF5 files.
    #################################################

    #FOR LOOP
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print f

            songH5File = hdf5_getters.open_h5_file_read(f)
            #song = Song(str(hdf5_getters.get_song_id(songH5File)))

            #testDanceability = hdf5_getters.get_danceability(songH5File)
            # print type(testDanceability)
            # print ("Here is the danceability: ") + str(testDanceability)
            numPerH5 = hdf5_getters.get_num_songs(songH5File)

            for cnt in range(numPerH5):
                song = Song(str(hdf5_getters.get_song_id(songH5File, cnt)))
                song.trackId = str(hdf5_getters.get_track_id(songH5File, cnt))
                song.artistID = str(hdf5_getters.get_artist_id(
                    songH5File, cnt))
                song.albumID = str(
                    hdf5_getters.get_release_7digitalid(songH5File, cnt))
                song.albumName = str(hdf5_getters.get_release(songH5File, cnt))
                song.artistLatitude = str(
                    hdf5_getters.get_artist_latitude(songH5File, cnt))
                song.artistLocation = str(
                    hdf5_getters.get_artist_location(songH5File, cnt))
                song.artistLongitude = str(
                    hdf5_getters.get_artist_longitude(songH5File, cnt))
                song.artistName = str(
                    hdf5_getters.get_artist_name(songH5File, cnt))
                song.danceability = str(
                    hdf5_getters.get_danceability(songH5File, cnt))
                song.duration = str(hdf5_getters.get_duration(songH5File, cnt))
                # song.setGenreList()
                song.keySignature = str(hdf5_getters.get_key(songH5File, cnt))
                song.keySignatureConfidence = str(
                    hdf5_getters.get_key_confidence(songH5File, cnt))
                # song.lyrics = None
                # song.popularity = None
                song.tempo = str(hdf5_getters.get_tempo(songH5File, cnt))
                song.timeSignature = str(
                    hdf5_getters.get_time_signature(songH5File, cnt))
                song.timeSignatureConfidence = str(
                    hdf5_getters.get_time_signature_confidence(
                        songH5File, cnt))
                song.title = str(hdf5_getters.get_title(songH5File, cnt))
                song.year = str(hdf5_getters.get_year(songH5File, cnt))

                #print song count
                csvRowString += str(song.songCount) + ","

                for attribute in csvAttributeList:
                    # print "Here is the attribute: " + attribute + " \n"

                    if attribute == 'AlbumID'.lower():
                        csvRowString += song.albumID
                    elif attribute == 'AlbumName'.lower():
                        albumName = song.albumName
                        albumName = albumName.replace(',', "")
                        csvRowString += "\"" + albumName + "\""
                    elif attribute == 'TrackId'.lower():
                        csvRowString += song.trackId
                    elif attribute == 'ArtistID'.lower():
                        csvRowString += "\"" + song.artistID + "\""
                    elif attribute == 'ArtistLatitude'.lower():
                        latitude = song.artistLatitude
                        if latitude == 'nan':
                            latitude = ''
                        csvRowString += latitude
                    elif attribute == 'ArtistLocation'.lower():
                        location = song.artistLocation
                        location = location.replace(',', '')
                        csvRowString += "\"" + location + "\""
                    elif attribute == 'ArtistLongitude'.lower():
                        longitude = song.artistLongitude
                        if longitude == 'nan':
                            longitude = ''
                        csvRowString += longitude
                    elif attribute == 'ArtistName'.lower():
                        csvRowString += "\"" + song.artistName + "\""
                    elif attribute == 'Danceability'.lower():
                        csvRowString += song.danceability
                    elif attribute == 'Duration'.lower():
                        csvRowString += song.duration
                    elif attribute == 'KeySignature'.lower():
                        csvRowString += song.keySignature
                    elif attribute == 'KeySignatureConfidence'.lower():
                        # print "key sig conf: " + song.timeSignatureConfidence
                        csvRowString += song.keySignatureConfidence
                    elif attribute == 'SongID'.lower():
                        csvRowString += "\"" + song.id + "\""
                    elif attribute == 'Tempo'.lower():
                        # print "Tempo: " + song.tempo
                        csvRowString += song.tempo
                    elif attribute == 'TimeSignature'.lower():
                        csvRowString += song.timeSignature
                    elif attribute == 'TimeSignatureConfidence'.lower():
                        # print "time sig conf: " + song.timeSignatureConfidence
                        csvRowString += song.timeSignatureConfidence
                    elif attribute == 'Title'.lower():
                        csvRowString += "\"" + song.title + "\""
                    elif attribute == 'Year'.lower():
                        csvRowString += song.year

                    else:
                        csvRowString += "Erm. This didn't work. Error. :( :(\n"

                    csvRowString += ","

            #Remove the final comma from each row in the csv
                lastIndex = len(csvRowString)
                csvRowString = csvRowString[0:lastIndex - 1]
                csvRowString += "\n"
                outputFile1.write(csvRowString)
                csvRowString = ""

            songH5File.close()

    outputFile1.close()
def data_to_flat_file(basedir, ext='.h5'):
    """This function extract the information from the tables and creates the flat file."""
    count = 0
    #song counter
    list_to_write = []
    row_to_write = ""
    writer = csv.writer(open("metadata_wholeA.csv", "wb"))
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print f  #the name of the file
            h5 = hdf5_getters.open_h5_file_read(f)
            title = hdf5_getters.get_title(h5)
            title = title.replace('"', '')
            comma = title.find(',')  #eliminating commas in the title
            if comma != -1:
                print title
                time.sleep(1)
            album = hdf5_getters.get_release(h5)
            album = album.replace('"', '')  #eliminating commas in the album
            comma = album.find(',')
            if comma != -1:
                print album
                time.sleep(1)
            artist_name = hdf5_getters.get_artist_name(h5)
            comma = artist_name.find(',')
            if comma != -1:
                print artist_name
                time.sleep(1)
            artist_name = artist_name.replace('"',
                                              '')  #eliminating double quotes
            duration = hdf5_getters.get_duration(h5)
            samp_rt = hdf5_getters.get_analysis_sample_rate(h5)
            artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5)
            artist_fam = hdf5_getters.get_artist_familiarity(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_fam) == True:
                artist_fam = -1
            artist_hotness = hdf5_getters.get_artist_hotttnesss(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_hotness) == True:
                artist_hotness = -1
            artist_id = hdf5_getters.get_artist_id(h5)
            artist_lat = hdf5_getters.get_artist_latitude(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_lat) == True:
                artist_lat = -1
            artist_loc = hdf5_getters.get_artist_location(h5)
            #checks artist_loc to see if it is a hyperlink if it is set as empty string
            artist_loc = artist_loc.replace(",", "\,")
            if artist_loc.startswith("<a"):
                artist_loc = ""
            if len(artist_loc) > 100:
                artist_loc = ""
            artist_lon = hdf5_getters.get_artist_longitude(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_lon) == True:
                artist_lon = -1
            artist_mbid = hdf5_getters.get_artist_mbid(h5)
            artist_pmid = hdf5_getters.get_artist_playmeid(h5)
            audio_md5 = hdf5_getters.get_audio_md5(h5)
            danceability = hdf5_getters.get_danceability(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(danceability) == True:
                danceability = -1
            end_fade_in = hdf5_getters.get_end_of_fade_in(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(end_fade_in) == True:
                end_fade_in = -1
            energy = hdf5_getters.get_energy(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(energy) == True:
                energy = -1
            song_key = hdf5_getters.get_key(h5)
            key_c = hdf5_getters.get_key_confidence(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(key_c) == True:
                key_c = -1
            loudness = hdf5_getters.get_loudness(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(loudness) == True:
                loudness = -1
            mode = hdf5_getters.get_mode(h5)
            mode_conf = hdf5_getters.get_mode_confidence(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(mode_conf) == True:
                mode_conf = -1
            release_7digitalid = hdf5_getters.get_release_7digitalid(h5)
            song_hot = hdf5_getters.get_song_hotttnesss(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(song_hot) == True:
                song_hot = -1
            song_id = hdf5_getters.get_song_id(h5)
            start_fade_out = hdf5_getters.get_start_of_fade_out(h5)
            tempo = hdf5_getters.get_tempo(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(tempo) == True:
                tempo = -1
            time_sig = hdf5_getters.get_time_signature(h5)
            time_sig_c = hdf5_getters.get_time_signature_confidence(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(time_sig_c) == True:
                time_sig_c = -1
            track_id = hdf5_getters.get_track_id(h5)
            track_7digitalid = hdf5_getters.get_track_7digitalid(h5)
            year = hdf5_getters.get_year(h5)
            bars_c = hdf5_getters.get_bars_confidence(h5)
            bars_c_avg = get_avg(bars_c)
            bars_c_max = get_max(bars_c)
            bars_c_min = get_min(bars_c)
            bars_c_stddev = get_stddev(bars_c)
            bars_c_count = get_count(bars_c)
            bars_c_sum = get_sum(bars_c)
            bars_start = hdf5_getters.get_bars_start(h5)
            bars_start_avg = get_avg(bars_start)
            bars_start_max = get_max(bars_start)
            bars_start_min = get_min(bars_start)
            bars_start_stddev = get_stddev(bars_start)
            bars_start_count = get_count(bars_start)
            bars_start_sum = get_sum(bars_start)
            beats_c = hdf5_getters.get_beats_confidence(h5)
            beats_c_avg = get_avg(beats_c)
            beats_c_max = get_max(beats_c)
            beats_c_min = get_min(beats_c)
            beats_c_stddev = get_stddev(beats_c)
            beats_c_count = get_count(beats_c)
            beats_c_sum = get_sum(beats_c)
            beats_start = hdf5_getters.get_beats_start(h5)
            beats_start_avg = get_avg(beats_start)
            beats_start_max = get_max(beats_start)
            beats_start_min = get_min(beats_start)
            beats_start_stddev = get_stddev(beats_start)
            beats_start_count = get_count(beats_start)
            beats_start_sum = get_sum(beats_start)
            sec_c = hdf5_getters.get_sections_confidence(h5)
            sec_c_avg = get_avg(sec_c)
            sec_c_max = get_max(sec_c)
            sec_c_min = get_min(sec_c)
            sec_c_stddev = get_stddev(sec_c)
            sec_c_count = get_count(sec_c)
            sec_c_sum = get_sum(sec_c)
            sec_start = hdf5_getters.get_sections_start(h5)
            sec_start_avg = get_avg(sec_start)
            sec_start_max = get_max(sec_start)
            sec_start_min = get_min(sec_start)
            sec_start_stddev = get_stddev(sec_start)
            sec_start_count = get_count(sec_start)
            sec_start_sum = get_sum(sec_start)
            seg_c = hdf5_getters.get_segments_confidence(h5)
            seg_c_avg = get_avg(seg_c)
            seg_c_max = get_max(seg_c)
            seg_c_min = get_min(seg_c)
            seg_c_stddev = get_stddev(seg_c)
            seg_c_count = get_count(seg_c)
            seg_c_sum = get_sum(seg_c)
            seg_loud_max = hdf5_getters.get_segments_loudness_max(h5)
            seg_loud_max_avg = get_avg(seg_loud_max)
            seg_loud_max_max = get_max(seg_loud_max)
            seg_loud_max_min = get_min(seg_loud_max)
            seg_loud_max_stddev = get_stddev(seg_loud_max)
            seg_loud_max_count = get_count(seg_loud_max)
            seg_loud_max_sum = get_sum(seg_loud_max)
            seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
            seg_loud_max_time_avg = get_avg(seg_loud_max_time)
            seg_loud_max_time_max = get_max(seg_loud_max_time)
            seg_loud_max_time_min = get_min(seg_loud_max_time)
            seg_loud_max_time_stddev = get_stddev(seg_loud_max_time)
            seg_loud_max_time_count = get_count(seg_loud_max_time)
            seg_loud_max_time_sum = get_sum(seg_loud_max_time)
            seg_loud_start = hdf5_getters.get_segments_loudness_start(h5)
            seg_loud_start_avg = get_avg(seg_loud_start)
            seg_loud_start_max = get_max(seg_loud_start)
            seg_loud_start_min = get_min(seg_loud_start)
            seg_loud_start_stddev = get_stddev(seg_loud_start)
            seg_loud_start_count = get_count(seg_loud_start)
            seg_loud_start_sum = get_sum(seg_loud_start)
            seg_pitch = hdf5_getters.get_segments_pitches(h5)
            pitch_size = len(seg_pitch)
            seg_start = hdf5_getters.get_segments_start(h5)
            seg_start_avg = get_avg(seg_start)
            seg_start_max = get_max(seg_start)
            seg_start_min = get_min(seg_start)
            seg_start_stddev = get_stddev(seg_start)
            seg_start_count = get_count(seg_start)
            seg_start_sum = get_sum(seg_start)
            seg_timbre = hdf5_getters.get_segments_timbre(h5)
            tatms_c = hdf5_getters.get_tatums_confidence(h5)
            tatms_c_avg = get_avg(tatms_c)
            tatms_c_max = get_max(tatms_c)
            tatms_c_min = get_min(tatms_c)
            tatms_c_stddev = get_stddev(tatms_c)
            tatms_c_count = get_count(tatms_c)
            tatms_c_sum = get_sum(tatms_c)
            tatms_start = hdf5_getters.get_tatums_start(h5)
            tatms_start_avg = get_avg(tatms_start)
            tatms_start_max = get_max(tatms_start)
            tatms_start_min = get_min(tatms_start)
            tatms_start_stddev = get_stddev(tatms_start)
            tatms_start_count = get_count(tatms_start)
            tatms_start_sum = get_sum(tatms_start)

            #Getting the genres
            genre_set = 0  #flag to see if the genre has been set or not
            art_trm = hdf5_getters.get_artist_terms(h5)
            trm_freq = hdf5_getters.get_artist_terms_freq(h5)
            trn_wght = hdf5_getters.get_artist_terms_weight(h5)
            a_mb_tags = hdf5_getters.get_artist_mbtags(h5)
            genre_indexes = get_genre_indexes(
                trm_freq)  #index of the highest freq
            final_genre = []
            genres_so_far = []
            for i in range(len(genre_indexes)):
                genre_tmp = get_genre(
                    art_trm, genre_indexes[i]
                )  #genre that corresponds to the highest freq
                genres_so_far = genre_dict.get_genre_in_dict(
                    genre_tmp)  #getting the genre from the dictionary
                if len(genres_so_far) != 0:
                    for i in genres_so_far:
                        final_genre.append(i)
                        genre_set = 1  #genre was found in dictionary

            if genre_set == 1:
                col_num = []

                for genre in final_genre:
                    column = int(
                        genre)  #getting the column number of the genre
                    col_num.append(column)

                genre_array = genre_columns(col_num)  #genre array
            else:
                genre_array = genre_columns(
                    -1)  #the genre was not found in the dictionary

            transpose_pitch = seg_pitch.transpose(
            )  #this is to tranpose the matrix,so we can have 12 rows
            #arrays containing the aggregate values of the 12 rows
            seg_pitch_avg = []
            seg_pitch_max = []
            seg_pitch_min = []
            seg_pitch_stddev = []
            seg_pitch_count = []
            seg_pitch_sum = []
            i = 0
            #Getting the aggregate values in the pitches array
            for row in transpose_pitch:
                seg_pitch_avg.append(get_avg(row))
                seg_pitch_max.append(get_max(row))
                seg_pitch_min.append(get_min(row))
                seg_pitch_stddev.append(get_stddev(row))
                seg_pitch_count.append(get_count(row))
                seg_pitch_sum.append(get_sum(row))
                i = i + 1

            #extracting information from the timbre array
            transpose_timbre = seg_pitch.transpose(
            )  #tranposing matrix, to have 12 rows
            #arrays containing the aggregate values of the 12 rows
            seg_timbre_avg = []
            seg_timbre_max = []
            seg_timbre_min = []
            seg_timbre_stddev = []
            seg_timbre_count = []
            seg_timbre_sum = []
            i = 0
            for row in transpose_timbre:
                seg_timbre_avg.append(get_avg(row))
                seg_timbre_max.append(get_max(row))
                seg_timbre_min.append(get_min(row))
                seg_timbre_stddev.append(get_stddev(row))
                seg_timbre_count.append(get_count(row))
                seg_timbre_sum.append(get_sum(row))
                i = i + 1

        #Writing to the flat file
            writer.writerow([
                title, album, artist_name, year, duration, seg_start_count,
                tempo
            ])

            h5.close()
            count = count + 1
            print count
def func_to_extract_features(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    global cntnan	
    global listfeatures

    cf = []
    h5 = GETTERS.open_h5_file_read(filename)
    nanfound = 0

    #Get target feature: song hotness

    #FEATURE 0
    song_hotness = GETTERS.get_song_hotttnesss(h5)
    if math.isnan(song_hotness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       if song_hotness <= 0.2:
          song_hotness_class = 0
       elif song_hotness <= 0.4:
          song_hotness_class = 1
       elif song_hotness <= 0.6:
          song_hotness_class = 2
       elif song_hotness <= 0.8:
          song_hotness_class = 3
       else:
          song_hotness_class = 4

       cf.append(song_hotness_class)

    #FEATURE 1
    #Get song loudness
    song_loudness = GETTERS.get_loudness(h5)
    
    if math.isnan(song_loudness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_loudness)

    #FEATURE 2 
    #Get song year
    song_year = GETTERS.get_year(h5)
    if song_year == 0:
       nanfound = 1
       cntnan = cntnan + 1
    else:
      cf.append(song_year)

    #FEATURE 3
    #Get song tempo
    song_tempo = GETTERS.get_tempo(h5)
    cf.append(song_tempo)

    #Feature 4 
    #Artist familarity
    artist_familiarity = GETTERS.get_artist_familiarity(h5)
    cf.append(artist_familiarity)

    #Feature 5 
    artist_hotness = GETTERS.get_artist_hotttnesss(h5)
    if math.isnan(artist_hotness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(artist_hotness)

    if nanfound == 0:
       strlist = list_to_csv(cf)
       listfeatures.append(strlist)

    h5.close()
Example #40
0
def process_filelist_test(filelist=None,
                          model=None,
                          tmpfilename=None,
                          npicks=None,
                          winsize=None,
                          finaldim=None,
                          K=1,
                          typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is in testartist)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff' or 'cov'
                      must be the same as in training
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model', model, 'does not exist.'
        return
    # create kdtree
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[
        1] == finaldim, 'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,
                        'year_real',
                        tables.IntAtom(shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'year_pred',
                        tables.Float64Atom(shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    # random projection
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # go through files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        if cnt_f % 5000 == 0:
            print 'TESTING FILE #' + str(cnt_f)
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0:  # probably useless but...
            continue
        if typecompress == 'picks':
            # we have a train artist with a song year, we're good
            bttimbre = get_bttimbre(f)
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre,
                                                        npicks,
                                                        winsize,
                                                        finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,
                                                     finaldim,
                                                     randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,
                                                    finaldim,
                                                    randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,
                                                       finaldim,
                                                       randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        if processed_feats is None:
            continue
        if processed_feats.shape[0] == 0:
            continue
        # do prediction
        year_pred = do_prediction(processed_feats, kd, h5model, K)
        # add pred and ground truth to output
        if not year_pred is None:
            output.root.data.year_real.append([year])
            output.root.data.year_pred.append([year_pred])
    # close output and model
    del kd
    h5model.close()
    output.close()
    # done
    return
def func_to_extract_features(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    global cntnan	
    global listfeatures

    cf = []
    h5 = GETTERS.open_h5_file_read(filename)
    nanfound = 0

    #Get target feature: song hotness

    #FEATURE 0
    song_hotness = GETTERS.get_song_hotttnesss(h5)
    if math.isnan(song_hotness):
       nanfound = 1
       cntnan = cntnan + 1
       h5.close()
       return 0
    elif song_hotness > 0.3 and song_hotness < 0.6:
         h5.close()
         return 0
    else:
       if song_hotness <= 0.3:
	  hotness_class = 0
       elif song_hotness >= 0.6:
	  hotness_class = 1
       cf.append(hotness_class)

    #FEATURE 1
    #Get song loudness
    song_loudness = GETTERS.get_loudness(h5)
    
    if math.isnan(song_loudness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
      #cf.append(song_loudness)
      pass

    #FEATURE 2
    #Get key of the song
    song_key = GETTERS.get_key(h5)
    if math.isnan(song_key):
       nanfound = 1
       cntnan = cntnan + 1
    else:
#        cf.append(song_key)
         pass

    #FEATURE 3

    song_duration = GETTERS.get_duration(h5)
    if math.isnan(song_duration):
       nanfound = 1
       cntnan = cntnan + 1
    else:
#       cf.append(song_duration)
        pass

    #Feature 4
    #Get song tempo
    song_tempo = GETTERS.get_tempo(h5)
    if math.isnan(song_tempo):
       nanfound = 1
       cntnan = cntnan + 1
    else:
#       cf.append(song_tempo)
        pass

    #Feature 5: artist familarity 
    artist_familiarity = GETTERS.get_artist_familiarity(h5)
    if math.isnan(artist_familiarity):
       nanfound = 1
       cntnan = cntnan + 1
    else:
#       cf.append(artist_familiarity)
       pass

    #Feature 6: artist_hotness
    artist_hotness = GETTERS.get_artist_hotttnesss(h5)
    if math.isnan(artist_hotness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
#        cf.append(artist_hotness)
         pass

    #Feature 7 time signature
    time_signature = GETTERS.get_time_signature(h5)
#   cf.append(time_signature)

    #Feature 8
    #Loudness COV
    loudness_segments = np.array(GETTERS.get_segments_loudness_max(h5))
    loudness_cov = abs(variation(loudness_segments))
    if math.isnan(loudness_cov):
       nanfound = 1
       cntnan = cntnan + 1
    else:
#      cf.append(loudness_cov)
       pass

    #Feature 9
    #Beat COV
    beat_segments = np.array(GETTERS.get_beats_start(h5))
    beat_cov = abs(variation(beat_segments))
    if math.isnan(beat_cov):
       nanfound = 1
       cntnan = cntnan + 1
    else:
#        cf.append(beat_cov)
        pass

    #Feature 10
    #Year
    song_year = GETTERS.get_year(h5)
    if song_year == 0:
       nanfound = 1
       cntnan = cntnan + 1
    else:
#       cf.append(song_year)
        pass
       

    title = GETTERS.get_title(h5)
    if title in energydict:
       audio_summary = energydict[title]
       energy = audio_summary['energy']
       danceability = audio_summary['danceability']
       speechiness = audio_summary['speechiness']
       liveness = audio_summary['liveness']
    else:
       stitle = re.sub(r'\([^)]*\)','', title)
       if stitle in energydict:
          audio_summary = energydict[stitle]

          energy = audio_summary['energy']
          danceability = audio_summary['danceability']
          speechiness = audio_summary['speechiness']
          liveness = audio_summary['liveness']
       else:
	  energy = 0.0
          danceability = 0.0
          speechiness = 0.0
          liveness = 0.0

    # Feature 11
    cf.append(energy)
    # Feature 12
#    cf.append(danceability)
    # Feature 13
#    cf.append(speechiness)
    # Feature 14
#    cf.append(liveness)

    if nanfound == 0:
       strlist = list_to_csv(cf)
       listfeatures.append(strlist)

    h5.close()
def process_filelist_train(filelist=None,testartists=None,tmpfilename=None,
                           npicks=None,winsize=None,finaldim=None,typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is not in testartist)
    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - one of 'picks' (win of btchroma), 'corrcoef' (correlation coefficients),
                      'cov' (covariance)
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None,'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file',tmpfilename,'already exists.'
        return
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/",'data','TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,'feats',tables.Float64Atom(shape=()),(0,finaldim),'',
                        expectedrows=len(filelist))
    output.createEArray(group,'year',tables.IntAtom(shape=()),(0,),'',
                        expectedrows=len(filelist))
    output.createEArray(group,'track_id',tables.StringAtom(18,shape=()),(0,),'',
                        expectedrows=len(filelist))
    # random projection
    ndim = 12 # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False,'Unknown type of compression: '+str(typecompress)
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #',cnt_f
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0 or artist_id in testartists:
            continue
        # we have a train artist with a song year, we're good
        bttimbre = get_bttimbre(f)
        if typecompress == 'picks':
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre,npicks,winsize,finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,finaldim,randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,finaldim,randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,finaldim,randproj=randproj)
        else:
            assert False,'Unknown type of compression: '+str(typecompress)
        # save them to tmp file
        n_p_feats = processed_feats.shape[0]
        output.root.data.year.append( np.array( [year] * n_p_feats ) )
        output.root.data.track_id.append( np.array( [track_id] * n_p_feats ) )
        output.root.data.feats.append( processed_feats )
    # we're done, close output
    output.close()
    return
Example #43
0
def process_filelist_train(filelist=None,
                           testartists=None,
                           tmpfilename=None,
                           npicks=None,
                           winsize=None,
                           finaldim=None,
                           typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is not in testartist)
    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - one of 'picks' (win of btchroma), 'corrcoef' (correlation coefficients),
                      'cov' (covariance)
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,
                        'feats',
                        tables.Float64Atom(shape=()), (0, finaldim),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'year',
                        tables.IntAtom(shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'track_id',
                        tables.StringAtom(18, shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    # random projection
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0 or artist_id in testartists:
            continue
        # we have a train artist with a song year, we're good
        bttimbre = get_bttimbre(f)
        if typecompress == 'picks':
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre,
                                                        npicks,
                                                        winsize,
                                                        finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,
                                                     finaldim,
                                                     randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,
                                                    finaldim,
                                                    randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,
                                                       finaldim,
                                                       randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        # save them to tmp file
        n_p_feats = processed_feats.shape[0]
        output.root.data.year.append(np.array([year] * n_p_feats))
        output.root.data.track_id.append(np.array([track_id] * n_p_feats))
        output.root.data.feats.append(processed_feats)
    # we're done, close output
    output.close()
    return
# This script converts the summary H5 files only 300MB to a csv file
# Run only on the Master Node since h5_getters cannot open a remote(ie. HDFS) file

if __name__ == "__main__":

    with open("fields.csv", "wb") as f:
        writer = csv.writer(f)  # initialize the csv writer

        # for each track in the summary file, get the 11 fields and output to csv
        h5_file = hdf5_getters.open_h5_file_read("msd_summary_file.h5")
        for k in range(1000000):
            print "index!!!: ", k
            id = hdf5_getters.get_track_id(h5_file, k)  # get track_id TRA13e39..
            title = hdf5_getters.get_title(h5_file, k)  # get song title
            artist_name = hdf5_getters.get_artist_name(h5_file, k)
            year = int(hdf5_getters.get_year(h5_file, k))
            hotness = float(hdf5_getters.get_song_hotttnesss(h5_file, k))
            artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5_file, k))
            f5 = int(hdf5_getters.get_key(h5_file, k))  # get key
            f2 = float(hdf5_getters.get_loudness(h5_file, k))  # get loudness
            f1 = float(hdf5_getters.get_tempo(h5_file, k))  # get tempo
            f4 = int(hdf5_getters.get_duration(h5_file, k))  # get duration
            f3 = float(hdf5_getters.get_time_signature(h5_file, k))  # get time signature

            # Get rid of missing info and change invalid numbers for meta data

            if not artist_name:
                artist_name = "unknown"

            if not artist_familiarity:
                artist_familiarity = 0.0
def data_to_flat_file(basedir,ext='.h5') :
    """This function extract the information from the tables and creates the flat file."""	
    count = 0;	#song counter
    list_to_write= []
    row_to_write = ""
    writer = csv.writer(open("metadata_wholeA.csv", "wb"))
    for root, dirs, files in os.walk(basedir):
	files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
	    print f	#the name of the file
            h5 = hdf5_getters.open_h5_file_read(f)
	    title = hdf5_getters.get_title(h5) 
	    title= title.replace('"','') 
	    comma=title.find(',')	#eliminating commas in the title
	    if	comma != -1:
		    print title
		    time.sleep(1)
	    album = hdf5_getters.get_release(h5)
	    album= album.replace('"','')	#eliminating commas in the album	
	    comma=album.find(',')
	    if	comma != -1:
		    print album
		    time.sleep(1)
	    artist_name = hdf5_getters.get_artist_name(h5)
	    comma=artist_name.find(',')
	    if	comma != -1:
		    print artist_name
		    time.sleep(1)
	    artist_name= artist_name.replace('"','')	#eliminating double quotes
	    duration = hdf5_getters.get_duration(h5)
	    samp_rt = hdf5_getters.get_analysis_sample_rate(h5)
	    artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5)
	    artist_fam = hdf5_getters.get_artist_familiarity(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_fam) == True:
	            artist_fam=-1
	    artist_hotness= hdf5_getters.get_artist_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_hotness) == True:
	            artist_hotness=-1
	    artist_id = hdf5_getters.get_artist_id(h5)
	    artist_lat = hdf5_getters.get_artist_latitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lat) == True:
	            artist_lat=-1
	    artist_loc = hdf5_getters.get_artist_location(h5)
		#checks artist_loc to see if it is a hyperlink if it is set as empty string
	    artist_loc = artist_loc.replace(",", "\,");
	    if artist_loc.startswith("<a"):
                artist_loc = ""
	    if len(artist_loc) > 100:
                artist_loc = ""
	    artist_lon = hdf5_getters.get_artist_longitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lon) == True:
	            artist_lon=-1
	    artist_mbid = hdf5_getters.get_artist_mbid(h5)
	    artist_pmid = hdf5_getters.get_artist_playmeid(h5)
	    audio_md5 = hdf5_getters.get_audio_md5(h5)
	    danceability = hdf5_getters.get_danceability(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(danceability) == True:
	            danceability=-1
	    end_fade_in =hdf5_getters.get_end_of_fade_in(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(end_fade_in) == True:
	            end_fade_in=-1
	    energy = hdf5_getters.get_energy(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(energy) == True:
	            energy=-1
            song_key = hdf5_getters.get_key(h5)
	    key_c = hdf5_getters.get_key_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(key_c) == True:
	            key_c=-1
	    loudness = hdf5_getters.get_loudness(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(loudness) == True:
	            loudness=-1
	    mode = hdf5_getters.get_mode(h5)
	    mode_conf = hdf5_getters.get_mode_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(mode_conf) == True:
	            mode_conf=-1
	    release_7digitalid = hdf5_getters.get_release_7digitalid(h5)
	    song_hot = hdf5_getters.get_song_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(song_hot) == True:
	            song_hot=-1
	    song_id = hdf5_getters.get_song_id(h5)
	    start_fade_out = hdf5_getters.get_start_of_fade_out(h5)
	    tempo = hdf5_getters.get_tempo(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(tempo) == True:
	            tempo=-1
	    time_sig = hdf5_getters.get_time_signature(h5)
	    time_sig_c = hdf5_getters.get_time_signature_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(time_sig_c) == True:
	            time_sig_c=-1
	    track_id = hdf5_getters.get_track_id(h5)
	    track_7digitalid = hdf5_getters.get_track_7digitalid(h5)
	    year = hdf5_getters.get_year(h5)
	    bars_c = hdf5_getters.get_bars_confidence(h5)
	    bars_c_avg= get_avg(bars_c)
	    bars_c_max= get_max(bars_c)
	    bars_c_min = get_min(bars_c)
	    bars_c_stddev= get_stddev(bars_c)
	    bars_c_count = get_count(bars_c)
	    bars_c_sum = get_sum(bars_c)
	    bars_start = hdf5_getters.get_bars_start(h5)
	    bars_start_avg = get_avg(bars_start)
	    bars_start_max= get_max(bars_start)
	    bars_start_min = get_min(bars_start)
	    bars_start_stddev= get_stddev(bars_start)
	    bars_start_count = get_count(bars_start)
	    bars_start_sum = get_sum(bars_start)
            beats_c = hdf5_getters.get_beats_confidence(h5)
            beats_c_avg= get_avg(beats_c)
	    beats_c_max= get_max(beats_c)
	    beats_c_min = get_min(beats_c)
	    beats_c_stddev= get_stddev(beats_c)
	    beats_c_count = get_count(beats_c)
	    beats_c_sum = get_sum(beats_c)
            beats_start = hdf5_getters.get_beats_start(h5)
 	    beats_start_avg = get_avg(beats_start)
	    beats_start_max= get_max(beats_start)
	    beats_start_min = get_min(beats_start)
	    beats_start_stddev= get_stddev(beats_start)
	    beats_start_count = get_count(beats_start)
	    beats_start_sum = get_sum(beats_start)
	    sec_c = hdf5_getters.get_sections_confidence(h5)
            sec_c_avg= get_avg(sec_c)
	    sec_c_max= get_max(sec_c)
	    sec_c_min = get_min(sec_c)
	    sec_c_stddev= get_stddev(sec_c)
	    sec_c_count = get_count(sec_c)
	    sec_c_sum = get_sum(sec_c)
	    sec_start = hdf5_getters.get_sections_start(h5)
            sec_start_avg = get_avg(sec_start)
	    sec_start_max= get_max(sec_start)
	    sec_start_min = get_min(sec_start)
	    sec_start_stddev= get_stddev(sec_start)
	    sec_start_count = get_count(sec_start)
	    sec_start_sum = get_sum(sec_start)
	    seg_c = hdf5_getters.get_segments_confidence(h5)
	    seg_c_avg= get_avg(seg_c)
	    seg_c_max= get_max(seg_c)
	    seg_c_min = get_min(seg_c)
	    seg_c_stddev= get_stddev(seg_c)
	    seg_c_count = get_count(seg_c)
	    seg_c_sum = get_sum(seg_c)
            seg_loud_max = hdf5_getters.get_segments_loudness_max(h5)
            seg_loud_max_avg= get_avg(seg_loud_max)
	    seg_loud_max_max= get_max(seg_loud_max)
	    seg_loud_max_min = get_min(seg_loud_max)
	    seg_loud_max_stddev= get_stddev(seg_loud_max)
	    seg_loud_max_count = get_count(seg_loud_max)
	    seg_loud_max_sum = get_sum(seg_loud_max)
	    seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
	    seg_loud_max_time_avg= get_avg(seg_loud_max_time)
	    seg_loud_max_time_max= get_max(seg_loud_max_time)
	    seg_loud_max_time_min = get_min(seg_loud_max_time)
	    seg_loud_max_time_stddev= get_stddev(seg_loud_max_time)
	    seg_loud_max_time_count = get_count(seg_loud_max_time)
	    seg_loud_max_time_sum = get_sum(seg_loud_max_time)
	    seg_loud_start = hdf5_getters.get_segments_loudness_start(h5)
	    seg_loud_start_avg= get_avg(seg_loud_start)
	    seg_loud_start_max= get_max(seg_loud_start)
	    seg_loud_start_min = get_min(seg_loud_start)
	    seg_loud_start_stddev= get_stddev(seg_loud_start)
	    seg_loud_start_count = get_count(seg_loud_start)
	    seg_loud_start_sum = get_sum(seg_loud_start)					      
	    seg_pitch = hdf5_getters.get_segments_pitches(h5)
	    pitch_size = len(seg_pitch)
	    seg_start = hdf5_getters.get_segments_start(h5)
	    seg_start_avg= get_avg(seg_start)
	    seg_start_max= get_max(seg_start)
	    seg_start_min = get_min(seg_start)
	    seg_start_stddev= get_stddev(seg_start)
	    seg_start_count = get_count(seg_start)
	    seg_start_sum = get_sum(seg_start)
	    seg_timbre = hdf5_getters.get_segments_timbre(h5)
	    tatms_c = hdf5_getters.get_tatums_confidence(h5)
	    tatms_c_avg= get_avg(tatms_c)
	    tatms_c_max= get_max(tatms_c)
	    tatms_c_min = get_min(tatms_c)
	    tatms_c_stddev= get_stddev(tatms_c)
	    tatms_c_count = get_count(tatms_c)
	    tatms_c_sum = get_sum(tatms_c)
	    tatms_start = hdf5_getters.get_tatums_start(h5)
	    tatms_start_avg= get_avg(tatms_start)
	    tatms_start_max= get_max(tatms_start)
	    tatms_start_min = get_min(tatms_start)
	    tatms_start_stddev= get_stddev(tatms_start)
	    tatms_start_count = get_count(tatms_start)
	    tatms_start_sum = get_sum(tatms_start)
	
	    #Getting the genres
	    genre_set = 0    #flag to see if the genre has been set or not
	    art_trm = hdf5_getters.get_artist_terms(h5)
	    trm_freq = hdf5_getters.get_artist_terms_freq(h5)
	    trn_wght = hdf5_getters.get_artist_terms_weight(h5)
	    a_mb_tags = hdf5_getters.get_artist_mbtags(h5)
	    genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq
	    final_genre=[]
	    genres_so_far=[]
	    for i in range(len(genre_indexes)):
		    genre_tmp=get_genre(art_trm,genre_indexes[i])   #genre that corresponds to the highest freq
		    genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary
		    if len(genres_so_far) != 0:
			    for i in genres_so_far:
				final_genre.append(i)
				genre_set=1				#genre was found in dictionary
				  
		
	    
	    if genre_set == 1:
		    col_num=[]
		   
		    for genre in final_genre:
			    column=int(genre)				#getting the column number of the genre
			    col_num.append(column)

		    genre_array=genre_columns(col_num)	         #genre array
 	    else:
		    genre_array=genre_columns(-1)		#the genre was not found in the dictionary

	    transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_pitch_avg=[]
	    seg_pitch_max=[]
	    seg_pitch_min=[]
            seg_pitch_stddev=[]
            seg_pitch_count=[]
	    seg_pitch_sum=[]
            i=0
	    #Getting the aggregate values in the pitches array
	    for row in transpose_pitch:
		   seg_pitch_avg.append(get_avg(row))
		   seg_pitch_max.append(get_max(row))
	           seg_pitch_min.append(get_min(row))
		   seg_pitch_stddev.append(get_stddev(row))
		   seg_pitch_count.append(get_count(row))
                   seg_pitch_sum.append(get_sum(row))
		   i=i+1

	    #extracting information from the timbre array 
            transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_timbre_avg=[]
	    seg_timbre_max=[]
	    seg_timbre_min=[]
            seg_timbre_stddev=[]
            seg_timbre_count=[]
	    seg_timbre_sum=[]
            i=0
	    for row in transpose_timbre:
		   seg_timbre_avg.append(get_avg(row))
		   seg_timbre_max.append(get_max(row))
	           seg_timbre_min.append(get_min(row))
		   seg_timbre_stddev.append(get_stddev(row))
		   seg_timbre_count.append(get_count(row))
                   seg_timbre_sum.append(get_sum(row))
		   i=i+1
		


		#Writing to the flat file
            writer.writerow([title,album,artist_name,year,duration,seg_start_count, tempo])

	    h5.close()
	    count=count+1;
	    print count;
Example #46
0
#!/usr/bin/env python

from context import *
from settings.filemgmt import fileManager
from settings.paths import MSD_TID_YEAR, MSD_FILES, sep
from hdf5_getters import open_h5_file_read, get_year

if __name__ == '__main__':

    hdf5Files = fileManager(MSD_FILES, 'r').split('\n')

    msdData = []

    for file in hdf5Files:
        h5 = open_h5_file_read(file)
        title = str(file.split('/')[-1].partition('.')[0])
        year = get_year(h5)
        if year > 1960:
            msdData.append(
                title +
                sep + str(year)
            )
        h5.close()

    fileManager(MSD_TID_YEAR, 'w', '\n'.join(msdData))
def writeSingleHDF5FileToTxtFile(songHDF5FileName):
    global maximumArtistNameLen
    global maximumArtistTagLen
    global maximumSongNameLen
    global maximumAlbumNameLen
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    songHDF5File = GETTERS.open_h5_file_read(songHDF5FileName)

    songID = GETTERS.get_song_id(songHDF5File)
    songName = GETTERS.get_title(songHDF5File)
    artistID = GETTERS.get_artist_id(songHDF5File)
    songAlbum = GETTERS.get_release(songHDF5File)
    songYear = GETTERS.get_year(songHDF5File)
    songTempo = GETTERS.get_tempo(songHDF5File)
    songDanceability = GETTERS.get_danceability(songHDF5File)
    songDuration = GETTERS.get_duration(songHDF5File)
    songEnergy = GETTERS.get_energy(songHDF5File)
    songKey = GETTERS.get_key(songHDF5File)
    songLoudness = GETTERS.get_loudness(songHDF5File)
    songMode = GETTERS.get_mode(songHDF5File)
    songTimeSignature = GETTERS.get_time_signature(songHDF5File)

    songsTableFile.write(songID + "\t" + songName + "\t" + artistID + "\t" +
                         songAlbum + "\t" + str(songYear) + "\t" +
                         str(songTempo) + "\t" + str(songDanceability) + "\t" +
                         str(songDuration) + "\t" + str(songEnergy) + "\t" +
                         str(songKey) + "\t" + str(songLoudness) + "\t" +
                         str(songMode) + "\t" + str(songTimeSignature) +
                         "\t\n")

    artistName = GETTERS.get_artist_name(songHDF5File)
    artistFamiliarity = GETTERS.get_artist_familiarity(songHDF5File)
    artistTagsArray = GETTERS.get_artist_mbtags(songHDF5File)

    artistsTableFile.write(artistID + "\t" + artistName + "\t" +
                           str(artistFamiliarity) + "\t\n")

    if len(songName) > maximumSongNameLen:
        maximumSongNameLen = len(songName)

    if len(songAlbum) > maximumAlbumNameLen:
        maximumAlbumNameLen = len(songAlbum)

    if len(artistName) > maximumArtistNameLen:
        maximumArtistNameLen = len(artistName)

    for artistTag in artistTagsArray:
        if artistTag in allowedTagsSet:

            artistsTagsTableFile.write(artistID + "\t" + artistTag + "\t\n")
            if artistTag not in tagsSet:
                tagsTableFile.write(artistTag + "\t\n")
                tagsSet.add(artistTag)
            if len(artistTag) > maximumArtistTagLen:
                maximumArtistTagLen = len(artistTag)

    similarArtists = GETTERS.get_similar_artists(songHDF5File)

    for similarArtist in similarArtists:
        similarArtistsPairsList.add((artistID, similarArtist))

    artistsIDsSet.add(artistID)
    artistsNamesSet.add(artistName)

    songHDF5File.close()
                 'dance and electronica','electronic']

# relevant metadata for all EDM songs found in the MSD
all_song_data = {}
pitch_segs_data = []
count = 0
start_time = time.time()
''' preprocessing before actually running the dirichlet process takes much more time 
than running the dirichlet process on data '''

for root, dirs, files in os.walk(basedir):
    files = glob.glob(os.path.join(root, '*' + ext))
    for f in files:
        h5 = hdf5_getters.open_h5_file_read(f)
        # if year unknown, throw out sample
        if hdf5_getters.get_year(h5) == 0:
            h5.close()
            continue
        if any(tag in str(hdf5_getters.get_artist_mbtags(h5))
               for tag in target_genres):
            print 'found electronic music song at {0} seconds'.format(
                time.time() - start_time)
            count += 1
            chord_changes = [0 for i in range(0, 192)]
            segments_pitches_old = hdf5_getters.get_segments_pitches(h5)
            segments_pitches_old_smoothed = []
            smoothing_factor = max(
                3,
                round(
                    len(segments_pitches_old) * 60.0 /
                    (hdf5_getters.get_tempo(h5) *
def get_fields(files):
    tracks = []
    counts = {}
    field_counts = []
    for file in files:
        h5 = hdf5_getters.open_h5_file_read(file)
        t = {}
        t['artist_familiarity'] = hdf5_getters.get_artist_familiarity(
            h5)  # estimation
        t['artist_hotttnesss'] = hdf5_getters.get_artist_hotttnesss(
            h5)  # estimation
        t['artist_name'] = hdf5_getters.get_artist_name(h5)  # artist name
        t['release'] = hdf5_getters.get_release(h5)  # album name
        t['title'] = hdf5_getters.get_title(h5)  # title
        t['len_similar_artists'] = len(
            hdf5_getters.get_similar_artists(h5))  # number of similar artists
        t['analysis_sample_rate'] = hdf5_getters.get_analysis_sample_rate(
            h5)  # sample rate of the audio used ?????????
        t['duration'] = hdf5_getters.get_duration(h5)  # seconds
        t['key'] = hdf5_getters.get_key(h5)  # key the song is in
        t['key_confidence'] = hdf5_getters.get_key_confidence(
            h5)  # confidence measure
        t['loudness'] = hdf5_getters.get_loudness(h5)  # overall loudness in dB
        t['mode_confidence'] = hdf5_getters.get_mode_confidence(
            h5)  # confidence measure
        t['start_of_fade_out'] = hdf5_getters.get_start_of_fade_out(
            h5)  # time in sec
        t['tempo'] = hdf5_getters.get_tempo(h5)  # estimated tempo in BPM
        t['time_signature'] = hdf5_getters.get_time_signature(
            h5)  # estimate of number of beats per bar, e.g. 4
        t['year'] = hdf5_getters.get_year(
            h5)  # song release year from MusicBrainz or 0

        timbre = hdf5_getters.get_segments_timbre(
            h5)  # 2D float array, texture features (MFCC+PCA-like)
        t['segments_timbre'] = timbre
        t['timbre_avg'] = timbre.mean(axis=0)  # list of 12 averages
        cov_mat_timbre = np.cov(timbre, rowvar=False)
        cov_timbre = []
        for i in range(len(cov_mat_timbre)):
            for j in range(len(cov_mat_timbre) - i):
                cov_timbre.append(cov_mat_timbre[i][j])
        t['timbre_cov'] = cov_timbre  # list of 78 covariances

        pitch = hdf5_getters.get_segments_pitches(
            h5)  # 2D float array, chroma feature, one value per note
        t['segments_pitch'] = pitch
        t['pitch_avg'] = pitch.mean(axis=0)  # list of 12 averages
        cov_mat_pitch = np.cov(pitch, rowvar=False)
        cov_pitch = []
        for i in range(len(cov_mat_pitch)):
            for j in range(len(cov_mat_pitch) - i):
                cov_pitch.append(cov_mat_timbre[i][j])
        t['pitch_cov'] = cov_pitch  # list of 78 covariances

        # seg_pitch = hdf5_getters.get_segments_pitches(h5)  # 2D float array, chroma feature, one value per note
        # print(seg_pitch.shape)

        # t['artist_latitude'] = hdf5_getters.get_artist_latitude(h5)  # float, ????????????????????????????????????????
        # t['artist_longitude'] = hdf5_getters.get_artist_longitude(h5)  # float, ??????????????????????????????????????
        # t['artist_location'] = hdf5_getters.get_artist_location(h5)  # location name
        # t['song_hotttnesss'] = hdf5_getters.get_song_hotttnesss(h5)  # estimation
        # t['danceability'] = hdf5_getters.get_danceability(h5)  # estimation
        # t['end_of_fade_in'] = hdf5_getters.get_end_of_fade_in(h5)  # seconds at the beginning of the song
        # t['energy'] = hdf5_getters.get_energy(h5)  # energy from listener point of view
        # t['mode'] = hdf5_getters.get_mode(h5)  # major or minor
        # t['time_signature_confidence'] = hdf5_getters.get_time_signature_confidence(h5)  # confidence measure
        # t['artist_mbtags_count'] = len(hdf5_getters.get_artist_mbtags_count(h5))  # array int, tag counts for musicbrainz tags
        # bad types or non arithmatic numbers
        '''
        # t['audio_md5'] = hdf5_getters.get_audio_md5(h5)  # hash code of the audio used for the analysis by The Echo Nest
        # t['artist_terms_weight'] = hdf5_getters.get_artist_terms_weight(h5)  # array float, echonest tags weight ?????
        # t['artist_terms_freq'] = hdf5_getters.get_artist_terms_freq(h5)  # array float, echonest tags freqs ??????????
        # t['artist_terms'] = hdf5_getters.get_artist_terms(h5)  # array string, echonest tags ?????????????????????????
        # t['artist_id'] = hdf5_getters.get_artist_id(h5)  # echonest id
        # t['artist_mbid'] = hdf5_getters.get_artist_mbid(h5)  # musicbrainz id
        # t['artist_playmeid'] = hdf5_getters.get_artist_playmeid(h5)  # playme id
        # t['artist_7digitalid'] = hdf5_getters.get_artist_7digitalid(h5)  # 7digital id
        # t['release_7digitalid'] = hdf5_getters.get_release_7digitalid(h5)  # 7digital id
        # t['song_id'] = hdf5_getters.get_song_id(h5)  # echonest id
        # t['track_7digitalid'] = hdf5_getters.get_track_7digitalid(h5)  # 7digital id
        # t['similar_artists'] = hdf5_getters.get_similar_artists(h5)  # string array of sim artist ids
        # t['track_id'] = hdf5_getters.get_track_id(h5)  # echonest track id
        # t['segments_start'] = hdf5_getters.get_segments_start(h5)  # array floats, musical events, ~ note onsets
        # t['segments_confidence'] = hdf5_getters.get_segments_confidence(h5)  # array floats, confidence measure
        # t['segments_pitches'] = hdf5_getters.get_segments_pitches(h5)  # 2D float array, chroma feature, one value per note
        # t['segments_timbre'] = hdf5_getters.get_segments_timbre(h5)  # 2D float array, texture features (MFCC+PCA-like)
        # t['segments_loudness_max'] = hdf5_getters.get_segments_loudness_max(h5)  # float array, max dB value
        # t['segments_loudness_max_time'] = hdf5_getters.get_segments_loudness_max_time(h5)  # float array, time of max dB value, i.e. end of attack
        # t['segments_loudness_start'] = hdf5_getters.get_segments_loudness_start(h5)  # array float, dB value at onset
        # t['sections_start'] = hdf5_getters.get_sections_start(h5)  # array float, largest grouping in a song, e.g. verse
        # t['sections_confidence'] = hdf5_getters.get_sections_confidence(h5)  # array float, confidence measure
        # t['beats_start'] = hdf5_getters.get_beats_start(h5)  # array float, result of beat tracking
        # t['beats_confidence'] = hdf5_getters.get_beats_confidence(h5)  # array float, confidence measure
        # t['bars_start'] = hdf5_getters.get_bars_start(h5)  # array float, beginning of bars, usually on a beat
        # t['bars_confidence'] = hdf5_getters.get_bars_confidence(h5)  # array float, confidence measure
        # t['tatums_start'] = hdf5_getters.get_tatums_start(h5)  # array float, smallest rythmic element
        # t['tatums_confidence'] = hdf5_getters.get_tatums_confidence(h5)  # array float, confidence measure
        # t['artist_mbtags'] = hdf5_getters.get_artist_mbtags(h5)  # array string, tags from musicbrainz.org 
        '''
        h5.close()

        for key, value in t.items():
            if isinstance(value, float) and math.isnan(value):
                pass
            if type(value) is np.ndarray:
                if key in counts.keys():
                    counts[key] += 1
                else:
                    counts[key] = 1
            elif value:
                if key in counts.keys():
                    counts[key] += 1
                else:
                    counts[key] = 1
            elif key not in counts.keys():
                counts[key] = 0

        count = 0
        for key, value in t.items():
            if isinstance(value, float) and math.isnan(value):
                pass
            elif type(value) is np.ndarray:
                count += 1
            elif value:
                count += 1
        field_counts.append(count)

        # progress bar
        if num_of_tracks >= 100:
            i = files.index(file) + 1
            scale = num_of_tracks / 100
            if i % math.ceil(len(files) * .05) == 0:
                sys.stdout.write('\r')
                # the exact output you're looking for:
                sys.stdout.write("Loading dataframe: [%-100s] %d%%" %
                                 ('=' * int(i // scale), 1 / scale * i))
                sys.stdout.flush()
                time.sleep(.01)

        tracks.append(t)
    print()
    return tracks, counts, field_counts
Example #50
0
        song_id = hdf5_getters.get_song_id(h5, songidx=row).decode('UTF-8')
        #         artist = hdf5_getters.get_artist_name(h5,songidx=row).decode('UTF-8')
        #         title= hdf5_getters.get_title(h5,songidx=row)#.decode('UTF-8')
        #         artist = "".join(c for c in unicodedata.normalize('NFD', str(artist.decode("utf8"))) if unicodedata.category(c) != "Mn")
        #         title = "".join(c for c in unicodedata.normalize('NFD', str(title.decode("utf8"))) if unicodedata.category(c) != "Mn")

        #single number features
        danceability = hdf5_getters.get_danceability(h5, songidx=row)
        duration = hdf5_getters.get_duration(h5, songidx=row)
        energy = hdf5_getters.get_energy(h5, songidx=row)
        loudness = hdf5_getters.get_loudness(h5, songidx=row)
        musicalKey = hdf5_getters.get_key(h5, songidx=row)
        mode = hdf5_getters.get_mode(h5, songidx=row)
        tempo = hdf5_getters.get_tempo(h5, songidx=row)
        time_signature = hdf5_getters.get_time_signature(h5, songidx=row)
        year = hdf5_getters.get_year(h5, songidx=row)
        song_hottness = hdf5_getters.get_song_hotttnesss(h5, songidx=row)
        end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5, songidx=row)
        start_of_fade_out = hdf5_getters.get_start_of_fade_out(h5, songidx=row)

        #timestamp features
        #take last element and divide by length to get beats/unit time, segments/unit_time
        bars_start = hdf5_getters.get_bars_start(h5, songidx=row)
        beats_start = hdf5_getters.get_beats_start(h5, songidx=row)
        sections_start = hdf5_getters.get_sections_start(h5, songidx=row)
        tatums_start = hdf5_getters.get_tatums_start(h5, songidx=row)
        segments_start = hdf5_getters.get_segments_start(h5, songidx=row)
        if len(bars_start) == 0: bars_start = 0.
        else: bars_start = bars_start[-1] / len(bars_start)
        if len(beats_start) == 0: beats_start = 0.
        else: beats_start = beats_start[-1] / len(beats_start)
            if name.endswith(".h5"):
                if process_completion % 10000 == 0:
                    print "done :", process_completion/10000.0, "%"
                    process_completion += 1
                else:
                    process_completion += 1

                tempPath = os.path.abspath(os.path.join(root,name))
                h5file = hdf5_getters.open_h5_file_read(tempPath)

                #data extract 1
                track_id_str = hdf5_getters.get_track_id(h5file)
                song_id_str = hdf5_getters.get_song_id(h5file)
                album_name_str = unicodedata.normalize('NFKD', unicode(hdf5_getters.get_release(h5file),encoding='ASCII',errors='ignore'))
                album_name_str = str(album_name_str).replace(","," ").replace("'","").replace("-"," ").replace("("," ").replace(")"," ").replace("/"," ").replace("\\"," ")
                year_str = str(hdf5_getters.get_year(h5file))

                counter += 1
                file_io_counter += 1
                if file_io_counter % 100 == 0:
                    if counter == 1:
                        my_array_data_extract_1 = numpy.array([track_id_str, song_id_str, album_name_str, year_str])
                    else :
                        my_array_data_extract_1 = numpy.vstack((my_array_data_extract_1,numpy.array([track_id_str, song_id_str, album_name_str, year_str])))

                    f_handle = file('msd_data_extract_1.bin','a')
                    numpy.savetxt(f_handle, my_array_data_extract_1, delimiter='|',fmt='%s')
                    f_handle.close()

                    del my_array_data_extract_1
                    counter = 0
def getData(starting_point):

    starting = starting_point * 10000
    files = glob.glob('/mnt/snap/data/*/*/*/*.h5')

    file_one_round = files[starting:starting + 10000]

    artist_ids = []

    song_beats_persecond = []
    song_duration = []
    song_end_fade_in = []
    song_start_fade_out = []
    song_key = []
    song_loudness = []

    song_segments_loudness_max = []
    song_segments_loudness_min = []
    song_segments_loudness_med = []

    song_segments_loudness_time_max = []
    song_segments_loudness_time_min = []
    song_segments_loudness_time_med = []

    song_mode = []
    song_sections_start = []
    song_pitches = []
    song_timbre = []
    song_tempo = []
    song_time_signature = []
    song_title = []
    artist_name = []
    year = []

    idx = np.triu_indices(12)

    #count = 1

    for f in file_one_round:
        h5 = HDF5.open_h5_file_read(f)

        songYear = g.get_year(h5)
        if songYear < 1990:
            continue

        artist_id = g.get_artist_id(h5)
        song_beat = (g.get_beats_start(h5)).tolist()
        songDuration = g.get_duration(h5)
        song_beat_persecond = float(len(song_beat)) / songDuration

        song_end_fadein = g.get_end_of_fade_in(h5)
        song_start_fadeout = g.get_start_of_fade_out(h5)
        songKey = g.get_key(h5)
        songLoudness = g.get_loudness(h5)

        song_loudness_max = (g.get_segments_loudness_max(h5)) // 10
        song_loudness_antilog = np.power(10, song_loudness_max)
        song_segmentsLoudness_max = np.amax(song_loudness_antilog)
        song_segmentsLoudness_min = np.amin(song_loudness_antilog)
        song_segmentsLoudness_med = np.median(song_loudness_antilog)

        song_segmentsLoudness_max_time = (
            g.get_segments_loudness_max_time(h5)).tolist()
        song_loudness_time = np.multiply(song_loudness_antilog,
                                         song_segmentsLoudness_max_time)
        song_segmentsLoudnessTime_max = np.amax(song_loudness_time)
        song_segmentsLoudnessTime_min = np.amin(song_loudness_time)
        song_segmentsLoudnessTime_med = np.median(song_loudness_time)

        songMode = g.get_mode(h5)
        song_sectionsStart = (g.get_sections_start(h5)).tolist()
        songPitches = g.get_segments_pitches(h5)
        songPitches_cov = np.cov(songPitches, rowvar=False)
        songPitches_mean = np.mean(songPitches, axis=0)
        #print(songPitches_cov.shape)
        songTimbre = g.get_segments_timbre(h5)
        songTimbre_cov = np.cov(songTimbre, rowvar=False)
        songTimbre_mean = np.mean(songTimbre, axis=0)
        #print(songTimbre_cov.shape)
        songTempo = g.get_tempo(h5)
        songTime_signature = g.get_time_signature(h5)
        songTitle = g.get_title(h5)
        artistName = g.get_artist_name(h5)

        artist_ids.append(artist_id)

        song_beats_persecond.append(song_beat_persecond)
        song_duration.append(songDuration)
        song_end_fade_in.append(song_end_fadein)
        song_start_fade_out.append(song_start_fadeout)
        song_key.append(songKey)
        song_loudness.append(songLoudness)

        song_segments_loudness_max.append(song_segmentsLoudness_max)
        song_segments_loudness_min.append(song_segmentsLoudness_min)
        song_segments_loudness_med.append(song_segmentsLoudness_med)

        song_segments_loudness_time_max.append(song_segmentsLoudnessTime_max)
        song_segments_loudness_time_min.append(song_segmentsLoudnessTime_min)
        song_segments_loudness_time_med.append(song_segmentsLoudnessTime_med)

        song_mode.append(songMode)
        song_sections_start.append(song_sectionsStart)
        pitches_mean_cov = (songPitches_cov[idx]).tolist()
        pitches_mean_cov.extend((songPitches_mean).tolist())
        song_pitches.append(pitches_mean_cov)
        timbre_mean_cov = (songTimbre_cov[idx]).tolist()
        timbre_mean_cov.extend((songTimbre_mean).tolist())
        song_timbre.append(timbre_mean_cov)
        song_tempo.append(songTempo)
        song_time_signature.append(songTime_signature)
        song_title.append(songTitle)
        artist_name.append(artistName)
        year.append(songYear)

        #print(count)
        #count = count + 1
        h5.close()

    #def createDictsFrom2DArray(dictionary, colName, featureList):
    #	for i in range(0,12):
    #		dictionary[colName+str(i)] = featureList[i]
    #i = 1
    #for t in itertools.izip_longest(*featureList):
    #	dictionary[colName+str(i)] = t
    #	i = i + 1
    #	return dictionary

    data = collections.OrderedDict()

    data['year'] = year
    data['artist_name'] = artist_name
    data['artist_id'] = artist_ids
    data['song_title'] = song_title
    data['song_beats_persecond'] = song_beats_persecond
    data['song_duration'] = song_duration
    data['song_end_fade_in'] = song_end_fade_in
    data['song_start_fade_out'] = song_start_fade_out
    data['song_key'] = song_key
    data['song_loudness'] = song_loudness

    data['song_loudness_max'] = song_segments_loudness_max
    data['song_loudness_min'] = song_segments_loudness_min
    data['song_loudness_med'] = song_segments_loudness_med

    data['song_loudness_time_max'] = song_segments_loudness_time_max
    data['song_loudness_time_min'] = song_segments_loudness_time_min
    data['song_loudness_time_med'] = song_segments_loudness_time_med

    data['song_mode'] = song_mode
    data['song_tempo'] = song_tempo
    data['song_time_signature'] = song_time_signature
    data = createDictsFrom1DArray(data, 'pitches', song_pitches)
    data = createDictsFrom1DArray(data, 'timbre', song_timbre)

    data = createDictsFrom1DArray(data, 'sections_start', song_sections_start)

    df = pd.DataFrame(data)
    print('before return ' + str(starting_point))

    return df
Example #53
0
                    cursor.execute("INSERT INTO artist_genres VALUES ('" + artist_id + "','" + term + "')")
            for tag in mbtags:
                tag = tag.replace("'","")
                cursor.execute("SELECT * FROM artist_genres WHERE artist_id='" + artist_id + "' AND genre ='" + tag + "'")
                if cursor.rowcount != 1:
                    cursor.execute("INSERT INTO artist_genres VALUES ('" + artist_id + "','" + tag + "')")

            ''' Store track tuples '''

            track_id = h.get_track_id(h5,0)
            track_title = h.get_title(h5,0)
            track_title = track_title.replace("'","")
            track_album = h.get_release(h5,0)
            track_album = track_album.replace("'","")
            track_duration = str(h.get_duration(h5,0))
            track_year = str(h.get_year(h5,0))

            cursor.execute("SELECT * FROM track WHERE track_id = '" + track_id  + "'")
            rs = cursor.fetchall()
            if cursor.rowcount != 1:
                cursor.execute("INSERT INTO track VALUES ('" + track_id + "','" + track_title + "','" + artist_id  + "','"  + artist_name + "','" + track_album + "'," + track_duration + "," + track_year  + ");")
                      
            ''' Store track_analysis tuples '''
            print ("Track ID: " + h.get_track_id(h5,0))
            track_tempo = str(h.get_tempo(h5,0))
            track_key = str(h.get_key(h5,0))
            track_danceability = str(h.get_danceability(h5,0))
            if track_danceability == "nan":
                track_danceability = "0.0"
            track_hottness = str(h.get_song_hotttnesss(h5,0))
            if track_hottness == "nan":
Example #54
0
def convert_to_csv():
    i = 0

    data = []
    target = []
    count = 0


    with open('data_timbre.csv', 'w+') as f:
        with open('target_timbre.csv', 'w+') as f2:
            writer = csv.writer(f)
            target_writer = csv.writer(f2)

            for root, dirs, files in os.walk(msd_subset_data_path):
                files = glob.glob(os.path.join(root,'*.h5'))
                for f in sorted(files):
                    try: # Opening is very prone to causing exceptions, we'll just skip file if exception is thrown
                        h5 = getter.open_h5_file_read(f)
                        year = getter.get_year(h5)
                        if year:
                            count +=1

                            analysis_file = open('current_analysis_status.txt','a')
                            update = "Currently at file name: " + str(f) + " and at number " + str(count) + "\n"
                            analysis_file.write(update)
                            print update
                            analysis_file.close()

                            target.append([year])
                            row = []
                            
                            timbre = getter.get_segments_timbre(h5)
                            segstarts = getter.get_segments_start(h5)
                            btstarts = getter.get_beats_start(h5)
                            duration = getter.get_duration(h5)
                            end_of_fade_in = getter.get_end_of_fade_in(h5)
                            key = getter.get_key(h5)
                            key_confidence = getter.get_key_confidence(h5)
                            loudness = getter.get_loudness(h5)
                            start_of_fade_out = getter.get_start_of_fade_out(h5)
                            tempo = getter.get_tempo(h5)
                            time_signature = getter.get_time_signature(h5)
                            time_signature_confidence = getter.get_time_signature_confidence(h5)

                            h5.close() # VERY IMPORTANT

                            segstarts = np.array(segstarts).flatten()
                            btstarts = np.array(btstarts).flatten()

                            bttimbre = align_feats(timbre.T, segstarts, btstarts, duration, end_of_fade_in, key, key_confidence, loudness, start_of_fade_out, tempo, time_signature, time_signature_confidence)

                            if bttimbre is None:
                                continue # Skip this track, some features broken

                            npicks, winsize, finaldim = 12, 12, 144  # Calculated by 12 * 12. 12 is fixed as number of dimensions.
                            processed_feats = extract_and_compress(bttimbre, npicks, winsize, finaldim)
                            n_p_feats = processed_feats.shape[0]

                            if processed_feats is None:
                                continue # Skip this track, some features broken

                            row = processed_feats.flatten()
                            if len(row) != 12*144: # 12 dimensions * 144 features per dimension
                                continue # Not enough features

                            year_row = np.array([year])

                            if row.any() and year_row.any():
                                writer.writerow(row)
                                target_writer.writerow(year_row)

                            i+=1

                        else:
                            h5.close()

                    except Exception:
                        pass



    print 'Finished!'

    analysis_file = open('current_analysis_status.txt','a')
    analysis_file.write('Done!')
    analysis_file.close()

    return 
Example #55
0
            # Get danceability
            danceability = hdf5_getters.get_danceability(h5)

            # Get duration
            duration = hdf5_getters.get_duration(h5)

            # Get energy
            #*****useless... column is filled with 0's?
            energy = hdf5_getters.get_energy(h5)

            # Get loudness
            loudness = hdf5_getters.get_loudness(h5)

            # Get year
            year = hdf5_getters.get_year(h5)

            # Get tempo
            tempo = hdf5_getters.get_tempo(h5)

            #########################################################

            # Get analysis sample rate
            analysis_rate = hdf5_getters.get_analysis_sample_rate(h5)

            # Get end of fade in
            end_of_fade_in = hdf5_getters.get_end_of_fade_in(h5)

            # Get key
            key = hdf5_getters.get_key(h5)
Example #56
0
def classify(h5):
	output_array={}
	# duration
	duration=hdf5_getters.get_duration(h5)
	output_array["duration"]=duration	### ADDED VALUE TO ARRAY
	# number of bars
	bars=hdf5_getters.get_bars_start(h5)
	num_bars=len(bars)
	output_array["num_bars"]=num_bars	### ADDED VALUE TO ARRAY
	# mean and variance in bar length
	bar_length=numpy.ediff1d(bars)
	variance_bar_length=numpy.var(bar_length)
	output_array["variance_bar_length"]=variance_bar_length	### ADDED VALUE TO ARRAY
	# number of beats
	beats=hdf5_getters.get_beats_start(h5)
	num_beats=len(beats)
	output_array["num_beats"]=num_beats	### ADDED VALUE TO ARRAY
	# mean and variance in beats length
	beats_length=numpy.ediff1d(beats)
	variance_beats_length=numpy.var(bar_length)
	output_array["variance_beats_length"]=variance_beats_length	### ADDED VALUE TO ARRAY
	# danceability
	danceability=hdf5_getters.get_danceability(h5)
	output_array["danceability"]=danceability	### ADDED VALUE TO ARRAY
	# end of fade in
	end_of_fade_in=hdf5_getters.get_end_of_fade_in(h5)
	output_array["end_of_fade_in"]=end_of_fade_in	### ADDED VALUE TO ARRAY
	# energy
	energy=hdf5_getters.get_energy(h5)
	output_array["energy"]=energy	### ADDED VALUE TO ARRAY
	# key
	key=hdf5_getters.get_key(h5)
	output_array["key"]=int(key)	### ADDED VALUE TO ARRAY
	# loudness
	loudness=hdf5_getters.get_loudness(h5)
	output_array["loudness"]=loudness	### ADDED VALUE TO ARRAY
	# mode
	mode=hdf5_getters.get_mode(h5)
	output_array["mode"]=int(mode)	### ADDED VALUE TO ARRAY
	# number sections
	sections=hdf5_getters.get_sections_start(h5)
	num_sections=len(sections)
	output_array["num_sections"]=num_sections	### ADDED VALUE TO ARRAY
	# mean and variance in sections length
	sections_length=numpy.ediff1d(sections)
	variance_sections_length=numpy.var(sections)
	output_array["variance_sections_length"]=variance_sections_length	### ADDED VALUE TO ARRAY
	# number segments
	segments=hdf5_getters.get_segments_start(h5)
	num_segments=len(segments)
	output_array["num_segments"]=num_segments	### ADDED VALUE TO ARRAY
	# mean and variance in segments length
	segments_length=numpy.ediff1d(segments)
	variance_segments_length=numpy.var(segments)
	output_array["variance_segments_length"]=variance_segments_length	### ADDED VALUE TO ARRAY
	# segment loudness max
	segment_loudness_max_array=hdf5_getters.get_segments_loudness_max(h5)
	segment_loudness_max_time_array=hdf5_getters.get_segments_loudness_max_time(h5)
	segment_loudness_max_index=0
	for i in range(len(segment_loudness_max_array)):
		if segment_loudness_max_array[i]>segment_loudness_max_array[segment_loudness_max_index]:
			segment_loudness_max_index=i
	segment_loudness_max=segment_loudness_max_array[segment_loudness_max_index]
	segment_loudness_max_time=segment_loudness_max_time_array[segment_loudness_max_index]
	output_array["segment_loudness_max"]=segment_loudness_max	### ADDED VALUE TO ARRAY
	output_array["segment_loudness_time"]=segment_loudness_max_time	### ADDED VALUE TO ARRAY
			
	# POSSIBLE TODO: use average function instead and weight by segment length
	# segment loudness mean (start)
	segment_loudness_array=hdf5_getters.get_segments_loudness_start(h5)
	segment_loudness_mean=numpy.mean(segment_loudness_array)
	output_array["segment_loudness_mean"]=segment_loudness_mean	### ADDED VALUE TO ARRAY
	# segment loudness variance (start)
	segment_loudness_variance=numpy.var(segment_loudness_array)
	output_array["segment_loudness_variance"]=segment_loudness_variance	### ADDED VALUE TO ARRAY
	# segment pitches
	segment_pitches_array=hdf5_getters.get_segments_pitches(h5)
	segment_pitches_mean=numpy.mean(segment_pitches_array,axis=0).tolist()
	output_array["segment_pitches_mean"]=segment_pitches_mean
	# segment pitches variance (start)
	segment_pitches_variance=numpy.var(segment_pitches_array,axis=0).tolist()
	output_array["segment_pitches_variance"]=segment_pitches_variance
	# segment timbres
	segment_timbres_array=hdf5_getters.get_segments_timbre(h5)
	segment_timbres_mean=numpy.mean(segment_timbres_array,axis=0).tolist()
	output_array["segment_timbres_mean"]=segment_timbres_mean
	# segment timbres variance (start)
	segment_timbres_variance=numpy.var(segment_timbres_array,axis=0).tolist()
	output_array["segment_timbres_variance"]=segment_timbres_variance
	# hotttnesss
	hottness=hdf5_getters.get_song_hotttnesss(h5,0)
	output_array["hottness"]=hottness	### ADDED VALUE TO ARRAY
	# duration-start of fade out
	start_of_fade_out=hdf5_getters.get_start_of_fade_out(h5)
	fade_out=duration-start_of_fade_out
	output_array["fade_out"]=fade_out	### ADDED VALUE TO ARRAY
	# tatums
	tatums=hdf5_getters.get_tatums_start(h5)
	num_tatums=len(tatums)
	output_array["num_tatums"]=num_tatums	### ADDED VALUE TO ARRAY
	# mean and variance in tatums length
	tatums_length=numpy.ediff1d(tatums)
	variance_tatums_length=numpy.var(tatums_length)
	output_array["variance_tatums_length"]=variance_tatums_length	### ADDED VALUE TO ARRAY
	# tempo
	tempo=hdf5_getters.get_tempo(h5)
	output_array["tempo"]=tempo	### ADDED VALUE TO ARRAY
	# time signature
	time_signature=hdf5_getters.get_time_signature(h5)
	output_array["time_signature"]=int(time_signature)	### ADDED VALUE TO ARRAY
	# year
	year=hdf5_getters.get_year(h5)
	output_array["year"]=int(year)	### ADDED VALUE TO ARRAY
	# artist terms
	artist_terms=hdf5_getters.get_artist_terms(h5,0)
	output_array["artist_terms"]=artist_terms.tolist()
	artist_terms_freq=hdf5_getters.get_artist_terms_freq(h5,0)
	output_array["artist_terms_freq"]=artist_terms_freq.tolist()
	artist_name=hdf5_getters.get_artist_name(h5,0)
	output_array["artist_name"]=artist_name
	artist_id=hdf5_getters.get_artist_id(h5,0)
	output_array["artist_id"]=artist_id
	# title
	title=hdf5_getters.get_title(h5,0)
	output_array["title"]=title

	return output_array
Example #57
0
def main():
    dataset_dir = sys.argv[1]
    global feat
    Create_BoW(dataset_dir)
    Size_BoW = Index_BoW(Bag_Words)
    count = Frequency(Size_BoW, dataset_dir)
    Size_BoW = Prune(count)
    Lablify()
    print "Forming Dataset..."
    listing1 = os.listdir(dataset_dir)
    for a in listing1:
        listing2 = os.listdir(dataset_dir+a+'/')
        for b in listing2:
            listing3 = os.listdir(dataset_dir+a+'/'+b+'/')
            for c in listing3:
                listing4 = os.listdir(dataset_dir+a+'/'+b+'/'+c+'/')
                for d in listing4:
                    h5 = hdf5_getters.open_h5_file_read(dataset_dir+a+'/'+b+'/'+c+'/'+d)
                    feat = []
                    temp = hdf5_getters.get_artist_hotttnesss(h5)
                    if (math.isnan(temp) or temp==0.0):
                        h5.close()
                        continue
                    feat.append(temp)

                    temp = hdf5_getters.get_artist_familiarity(h5)
                    if (math.isnan(temp) or temp==0.0):
                        h5.close()
                        continue
                    feat.append(temp)

                    temp = hdf5_getters.get_bars_confidence(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_beats_confidence(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    mm = np.mean(temp)
                    vv = np.var(temp)
                    if mm==0.0 and vv==0.0:
                    	h5.close()
                        continue
                    feat.append(mm)
                    feat.append(vv)


                    feat.append(hdf5_getters.get_duration(h5))

                    temp = hdf5_getters.get_end_of_fade_in(h5)
                    if (math.isnan(temp)):
                        h5.close()
                        continue
                    feat.append(temp)


                    feat.append(hdf5_getters.get_key(h5))

                    temp = hdf5_getters.get_key_confidence(h5)
                    if (math.isnan(temp)):
                        h5.close()
                        continue
                    feat.append(temp)

                    temp = hdf5_getters.get_loudness(h5)
                    if (math.isnan(temp)):
                        h5.close()
                        continue
                    feat.append(temp)

                    feat.append(hdf5_getters.get_mode(h5))

                    temp = hdf5_getters.get_mode_confidence(h5)
                    if (math.isnan(temp)):
                        h5.close()
                        continue
                    feat.append(temp)

                    temp = hdf5_getters.get_sections_confidence(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_segments_confidence(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_segments_loudness_max(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_segments_loudness_max_time(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_segments_pitches(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_segments_timbre(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_start_of_fade_out(h5)
                    if (math.isnan(temp)):
                        h5.close()
                        continue
                    feat.append(temp)

                    temp = hdf5_getters.get_tatums_confidence(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    MeanVar(temp)

                    temp = hdf5_getters.get_tempo(h5)
                    if (math.isnan(temp)):
                        h5.close()
                        continue
                    feat.append(temp)

                    feat.append(hdf5_getters.get_time_signature(h5))

                    temp = hdf5_getters.get_time_signature_confidence(h5)
                    if (math.isnan(temp)):
                        h5.close()
                        continue
                    feat.append(temp)

                    temp = hdf5_getters.get_year(h5)
                    if temp == 0:
                        h5.close()
                        continue
                    feat.append(temp)


                    temp = hdf5_getters.get_artist_terms(h5)
                    if temp.size == 0:
                        h5.close()
                        continue
                    temp_ = hdf5_getters.get_artist_terms_weight(h5)
                    if temp_.size == 0:
                        continue
                    for j in Final_BoW:
                        if j in temp:
                            x = np.where(temp==j)
                            x = x[0][0]
                            feat.append(temp_[x])
                        else:
                            x = 0.0
                            feat.append(x)

                    temp = hdf5_getters.get_song_hotttnesss(h5)
                    if (math.isnan(temp) or temp==0.0):
                        h5.close()
                        continue
                    hott = 0
                    if temp >=0.75:
                        hott = 1
                    elif temp >=0.40 and temp <0.75:
                        hott = 2
                    else:
                        hott = 3
                    feat.append(hott)

                    h5.close()


                    count = 1
                    f=open('MSD_DATASET.txt', 'a')
                    outstring=''
                    cnt = 0
                    feat_size = len(feat)
                    for i in feat:
                        cnt+=1
                        outstring+=str(i)
                        if (cnt!=feat_size):
                            outstring+=','
                    outstring+='\n'
                    f.write(outstring)
                    f.close()
		score = row[4]

		all_chart_info[(artist,title)] = score;
"""

print "Done loading chart info"

song_list = []
j = 0

for i in range(0, numSongs):
	j += 1
	#print 
	track = {}
	#Handle each one
	year = h5get.get_year(h5, i)
	if year < 1980 or year > 2010:
		continue;

	song = Song()
	#song.year = year
	#song.hotness = h5get.get_song_hotttnesss(h5, i)

	#print "Hotness: ", song.hotness;
	#if math.isnan(song.hotness):
	#	song.hotness = 0.0;

	song.artist = h5get.get_artist_name(h5, i)
	song.name = h5get.get_title(h5, i)
	#track['track'] = str(song.artist) + " " + str(song.name)
	#track['hotness'] = float(song.hotness)
def func_to_extract_features(filename):
    """
    This function does 3 simple things:
    - open the song file
    - get artist ID and put it
    - close the file
    """
    global cntnan	
    global cntdanceability
    global listfeatures

    global listhotness
    global listyear
    global listloudness
    global listkey
    global listmode
    global listduration 

    cf = []
    h5 = GETTERS.open_h5_file_read(filename)
    nanfound = 0

    #Get target feature: song hotness

    #FEATURE 0
    song_hotness = GETTERS.get_song_hotttnesss(h5)
    if math.isnan(song_hotness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_hotness)

    #FEATURE 1
    #Get song loudness
    song_loudness = GETTERS.get_loudness(h5)
    
    if math.isnan(song_loudness):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_loudness)

    #FEATURE 2
    #Get key of the song
    song_key = GETTERS.get_key(h5)
    if math.isnan(song_key):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_key)

    #FEATURE 3
    #Get duration of the song
    song_duration = GETTERS.get_duration(h5)
    if math.isnan(song_duration):
       nanfound = 1
       cntnan = cntnan + 1
    else:
       cf.append(song_duration)

    #FEATURE 4-15
    #Get Average Pitch Class across all segments
    #Get the pitches (12 pitches histogram for each segment)
    pitches = GETTERS.get_segments_pitches(h5)
    M = np.mat(pitches)
    meanpitches = M.mean(axis=0)
    pitches_arr = np.asarray(meanpitches)
    pitches_list = []
    for i in range(0,12):
	pitches_list.append(pitches_arr[0][i])

    cf.append(pitches_list)

    #FEATURE 16, 27
    #Get Average Timbre Class across all segments
    timbres = GETTERS.get_segments_timbre(h5)
    M = np.mat(timbres)
    meantimbres = M.mean(axis=0)
    timbre_arr = np.asarray(meantimbres)
    timbre_list = []
    for i in range(0,12):
	timbre_list.append(timbre_arr[0][i])

    cf.append(timbre_list)

    #FEATURE 28 
    #Get song year
    song_year = GETTERS.get_year(h5)
    if song_year == 0:
       nanfound = 1
       cntnan = cntnan + 1
    else:
      cf.append(song_year)

    #FEATURE 29 
    #Get song tempo
    song_tempo = GETTERS.get_tempo(h5)
    cf.append(song_tempo)

    #Feature 30
    #Get max loudness for each segment
    max_loudness_arr = GETTERS.get_segments_loudness_max(h5)
    start_loudness_arr = GETTERS.get_segments_loudness_start(h5)
    if nanfound == 0:
       cf.append(max(max_loudness_arr)-min(start_loudness_arr))

    #Feature 31
    artist_familiarity = GETTERS.get_artist_familiarity(h5)
    cf.append(artist_familiarity)

    #Feature 32
    song_title = GETTERS.get_title(h5)
    cf.append(song_title)

    #Featture 33
    artist_name = GETTERS.get_artist_name(h5)
    cf.append(artist_name)

    #Feature 34
    #location = GETTERS.get_artist_location(h5)
    #cf.append(location)

    #Tags
    artist_mbtags = GETTERS.get_artist_mbtags(h5)
    if not artist_mbtags.size:
       genre = "Unknown"
    else:
       artist_mbcount = np.array(GETTERS.get_artist_mbtags_count(h5))
       index_max = artist_mbcount.argmax(axis=0)
       genre = artist_mbtags[index_max]
       if genre == 'espa\xc3\xb1ol':
	  genre = "Unknown"

       cf.append(genre)

    if nanfound == 0:
       strlist = list_to_csv(cf)
       listfeatures.append(strlist)
       mydict.setdefault(artist_name,[]).append(song_hotness)
    h5.close()
Example #60
0
def main():
    outputFile = open('songs.csv', 'w')
    writer = csv.writer(outputFile)

    csvRowString = "song_number,artist_familiarity,artist_hotttnesss,artist_id,artist_mbid,artist_playmeid,artist_7digitalid,artist_latitude,artist_longitude,artist_location,artist_name,release,release_7digitalid,song_id,song_hotttnesss,title,track_7digitalid,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id,year"

    outputFile.write(csvRowString + "\n")
    csvRowString = ""

    #################################################
    #Set the basedir here, the root directory from which the search
    #for files stored in a (hierarchical data structure) will originate
    basedir = "."  # "." As the default means the current directory
    ext = ".H5"  #Set the extension here. H5 is the extension for HDF5 files.
    #################################################

    #FOR LOOP
    songCount = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print(f)

            songH5File = hdf5_getters.open_h5_file_read(f)

            values = [
                songCount,
                hdf5_getters.get_artist_familiarity(songH5File),
                hdf5_getters.get_artist_hotttnesss(songH5File),
                hdf5_getters.get_artist_id(songH5File),
                hdf5_getters.get_artist_mbid(songH5File),
                hdf5_getters.get_artist_playmeid(songH5File),
                hdf5_getters.get_artist_7digitalid(songH5File),
                hdf5_getters.get_artist_latitude(songH5File),
                hdf5_getters.get_artist_longitude(songH5File),
                hdf5_getters.get_artist_location(songH5File),
                hdf5_getters.get_artist_name(songH5File),
                hdf5_getters.get_release(songH5File),
                hdf5_getters.get_release_7digitalid(songH5File),
                hdf5_getters.get_song_id(songH5File),
                hdf5_getters.get_song_hotttnesss(songH5File),
                hdf5_getters.get_title(songH5File),
                hdf5_getters.get_track_7digitalid(songH5File),
                hdf5_getters.get_analysis_sample_rate(songH5File),
                hdf5_getters.get_audio_md5(songH5File),
                hdf5_getters.get_danceability(songH5File),
                hdf5_getters.get_duration(songH5File),
                hdf5_getters.get_end_of_fade_in(songH5File),
                hdf5_getters.get_energy(songH5File),
                hdf5_getters.get_key(songH5File),
                hdf5_getters.get_key_confidence(songH5File),
                hdf5_getters.get_loudness(songH5File),
                hdf5_getters.get_mode(songH5File),
                hdf5_getters.get_mode_confidence(songH5File),
                hdf5_getters.get_start_of_fade_out(songH5File),
                hdf5_getters.get_tempo(songH5File),
                hdf5_getters.get_time_signature(songH5File),
                hdf5_getters.get_time_signature_confidence(songH5File),
                hdf5_getters.get_track_id(songH5File),
                hdf5_getters.get_year(songH5File)
            ]
            songH5File.close()
            songCount = songCount + 1

            writer.writerow(values)

    outputFile.close()