def feat_from_file(path):
    """
    Extract a list of features from one MSD HDF5 song file.

    INPUT
       path - path to a .h5 song file readable by hdf5_getters
    RETURN
       list of features, every element already converted to string:
       track id, artist name, title, loudness, tempo, time signature,
       key, mode, duration, then the per-dimension timbre means and
       timbre variances.
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    try:
        # basic info (commas stripped so the row can be joined into a CSV line)
        feats.append(GETTERS.get_track_id(h5))
        feats.append(GETTERS.get_artist_name(h5).decode().replace(',', ''))
        feats.append(GETTERS.get_title(h5).decode().replace(',', ''))
        feats.append(GETTERS.get_loudness(h5))
        feats.append(GETTERS.get_tempo(h5))
        feats.append(GETTERS.get_time_signature(h5))
        feats.append(GETTERS.get_key(h5))
        feats.append(GETTERS.get_mode(h5))
        feats.append(GETTERS.get_duration(h5))
        # timbre: per-dimension mean and variance over all segments
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre, axis=0))
        feats.extend(np.var(timbre, axis=0))
    finally:
        # bug fix: close the h5 file even if a getter raises
        h5.close()
    # bug fix: map() returns a lazy iterator on Python 3; the docstring
    # promises "already converted to string", so materialize a real list.
    return [str(x) for x in feats]
Esempio n. 2
0
    def ingest(self):
        """
        Browse the MSD summary file and bulk-ingest every track into
        Elasticsearch.

        For each track, every getter named in the global ``getters`` list is
        applied and its (type-converted) value is stored under an
        ``msd_``-prefixed field name. Documents are buffered and sent to ES
        in batches of ``es_bulk_size``.
        """
        es_bulk_docs = {}
        msd_id = ""
        sng_idx = 0

        for h5_fd, sng_idx in self.track_generator.get_track():
            msd_doc = {}
            msd_id = hdf5_getters.get_track_id(h5_fd, sng_idx)

            for getter in getters:
                # "get_artist_name" -> field "artist_name" -> "msd_artist_name"
                field_name = getter.split("get_")[-1]
                msd_field_name = "msd_" + field_name  # prefixed for ES storage
                try:
                    # idiom fix: getattr() instead of calling
                    # __getattribute__ on the module directly
                    msd_field_value = getattr(hdf5_getters, getter)(h5_fd, sng_idx)

                    # Type conversions (numpy values -> ES-friendly types)
                    msd_field_value = Ingestor.convert_type(msd_field_value)

                    msd_doc[msd_field_name] = msd_field_value
                except AttributeError as e:  # bug fix: Py3-compatible syntax
                    # getter missing for this file layout; skip the field
                    logger.debug("ERROR. AttributeError. {}".format(e))

            es_bulk_docs[msd_id] = msd_doc

            # Ingest bulk if size is enough
            if len(es_bulk_docs) == es_bulk_size:
                logger.debug("{} files read. Bulk ingest.".format(sng_idx + 1))
                logger.debug("Last MSD id read: {}".format(msd_id))
                self.es_helper.ingest_to_es(es_bulk_docs)
                es_bulk_docs = {}

        # bug fix: the original silently dropped the trailing partial batch
        # (fewer than es_bulk_size docs) at end of iteration — flush it.
        if es_bulk_docs:
            logger.debug("Flushing final partial bulk of {} docs.".format(len(es_bulk_docs)))
            self.es_helper.ingest_to_es(es_bulk_docs)
Esempio n. 3
0
def feat_from_file(path):
    """
    Collect basic song-level attributes plus timbre statistics from one
    MSD HDF5 file and return them as a flat list.
    """
    h5 = GETTERS.open_h5_file_read(path)

    # Scalar / string song attributes, in a fixed column order.
    scalar_getters = (
        GETTERS.get_track_id,
        GETTERS.get_title,
        GETTERS.get_artist_name,
        GETTERS.get_year,
        GETTERS.get_loudness,
        GETTERS.get_tempo,
        GETTERS.get_time_signature,
        GETTERS.get_key,
        GETTERS.get_mode,
        GETTERS.get_duration,
    )
    feats = [getter(h5) for getter in scalar_getters]

    # Timbre: per-dimension mean and variance across all segments.
    timbre = GETTERS.get_segments_timbre(h5)
    feats.extend(np.average(timbre, axis=0))
    feats.extend(np.var(timbre, axis=0))

    h5.close()
    return feats
Esempio n. 4
0
def feat_from_file(path):
    """
    Extract a list of features in an array, already converted to string.

    Column order: track id, artist name, title, loudness, tempo,
    time signature, key, mode, duration, then the per-dimension timbre
    means and timbre variances.
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    try:
        # basic info (commas removed so the values are CSV-safe)
        feats.append(GETTERS.get_track_id(h5))
        feats.append(GETTERS.get_artist_name(h5).replace(',', ''))
        feats.append(GETTERS.get_title(h5).replace(',', ''))
        feats.append(GETTERS.get_loudness(h5))
        feats.append(GETTERS.get_tempo(h5))
        feats.append(GETTERS.get_time_signature(h5))
        feats.append(GETTERS.get_key(h5))
        feats.append(GETTERS.get_mode(h5))
        feats.append(GETTERS.get_duration(h5))
        # timbre statistics over all segments
        timbre = GETTERS.get_segments_timbre(h5)
        feats.extend(np.average(timbre, axis=0))
        feats.extend(np.var(timbre, axis=0))
    finally:
        # bug fix: always close the h5 file, even on a getter failure
        h5.close()
    # bug fix: map() is lazy on Python 3; return a real list of strings
    # as the docstring promises.
    return [str(x) for x in feats]
Esempio n. 5
0
def get_all_examples(basedir, genre_dict, ext='.h5'):
    """
    From a base directory, goes through all subdirectories,
    and grabs all songs and their features and puts them into a pandas dataframe
    INPUT
       basedir    - base directory of the dataset
       genre_dict - a dictionary mapping track id to genre based tagraum dataset
       ext        - extension, .h5 by default
    RETURN
       dataframe containing all song examples
    """
    columns = [
        'artist_name', 'title', 'song_id', 'genre', 'year',
        'key', 'key_confidence', 'mode', 'mode_confidence',
        'time_signature', 'time_signature_confidence',
        'duration', 'end_of_fade_in', 'loudness',
        'song_hotttnesss', 'tempo', 'num_segments'
    ]
    rows = []

    # iterate over all files in all subdirectories
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = GETTERS.open_h5_file_read(f)
            try:
                song_id = GETTERS.get_track_id(h5).decode('utf-8')
                if song_id in genre_dict:
                    genre = genre_dict[song_id]
                    year = GETTERS.get_year(h5)
                    duration = GETTERS.get_duration(h5)
                    end_of_fade_in = GETTERS.get_end_of_fade_in(h5)
                    loudness = GETTERS.get_loudness(h5)
                    song_hotttnesss = GETTERS.get_song_hotttnesss(h5)
                    tempo = GETTERS.get_tempo(h5)
                    key = GETTERS.get_key(h5)
                    key_confidence = GETTERS.get_key_confidence(h5)
                    mode = GETTERS.get_mode(h5)
                    mode_confidence = GETTERS.get_mode_confidence(h5)
                    time_signature = GETTERS.get_time_signature(h5)
                    time_signature_confidence = GETTERS.get_time_signature_confidence(
                        h5)
                    artist_name = GETTERS.get_artist_name(h5)
                    title = GETTERS.get_title(h5)
                    # length of segments_confidence array gives the segment count
                    num_segments = len(GETTERS.get_segments_confidence(h5))
                    # bug fix: the original filled the 'num_segments' column
                    # with num_sections (len of sections_start) while the
                    # computed num_segments value went unused.
                    rows.append(
                        (artist_name, title, song_id, genre, year, key,
                         key_confidence, mode, mode_confidence, time_signature,
                         time_signature_confidence, duration, end_of_fade_in,
                         loudness, song_hotttnesss, tempo, num_segments))
            finally:
                # bug fix: close the h5 handle even if a getter raises
                h5.close()

    # Build the frame once; per-row DataFrame.append is deprecated (removed
    # in pandas 2.0) and quadratic in the number of songs.
    return pd.DataFrame(data=rows, columns=columns)
Esempio n. 6
0
def func_to_desired_song_data(filename):
    """
    If the track stored in *filename* is one of the pre-selected
    ``random_songs``, extract its metadata and append a dict of it to the
    global ``all_the_data`` list.
    """
    h5 = GETTERS.open_h5_file_read(filename)
    track_id = GETTERS.get_track_id(h5)
    for song in random_songs:
        if song[0] != track_id:
            continue
        print("FOUND ONE!")
        # Dict values are evaluated in source order, matching the original
        # getter call sequence.
        song_data = {
            'title': replace_characters(GETTERS.get_title(h5)),
            'artist': replace_characters(GETTERS.get_artist_name(h5)),
            'year': GETTERS.get_year(h5),
            'tempo': GETTERS.get_tempo(h5),
            'key': GETTERS.get_key(h5),
            'loudness': GETTERS.get_loudness(h5),
            'energy': GETTERS.get_energy(h5),
            'danceability': GETTERS.get_danceability(h5),
            'time_signature': GETTERS.get_time_signature(h5),
            'mode': GETTERS.get_mode(h5),
            'hotttness': GETTERS.get_song_hotttnesss(h5),
        }
        all_the_data.append(song_data)

    h5.close()
Esempio n. 7
0
def extract_features(filename):
    """
    Read one MSD HDF5 file and return its feature values as a list aligned
    with the global ``features`` column list.

    INPUT
       filename - path to a .h5 song file
    RETURN
       list with one entry per name in ``features``; 'tags' holds the
       artist terms joined with '|'.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        f = [None] * len(features)
        f[features.index('track_id')] = hdf5_getters.get_track_id(h5, 0).item()
        f[features.index('song_id')] = hdf5_getters.get_song_id(h5, 0).item()
        f[features.index('hotttnesss')] = hdf5_getters.get_artist_hotttnesss(
            h5, 0).item()
        f[features.index('danceability')] = hdf5_getters.get_danceability(
            h5, 0).item()
        f[features.index('duration')] = hdf5_getters.get_duration(h5, 0).item()
        f[features.index('key')] = hdf5_getters.get_key(h5, 0).item()
        f[features.index('energy')] = hdf5_getters.get_energy(h5, 0).item()
        f[features.index('loudness')] = hdf5_getters.get_loudness(h5, 0).item()
        f[features.index('year')] = hdf5_getters.get_year(h5, 0).item()
        f[features.index('time_signature')] = hdf5_getters.get_time_signature(
            h5, 0).item()
        f[features.index('tempo')] = hdf5_getters.get_tempo(h5, 0).item()
        # idiom fix: '|'.join replaces the manual concat-then-strip-trailing-
        # pipe loop (same result, including '' for an empty term list).
        f[features.index('tags')] = '|'.join(
            '%s' % tag for tag in hdf5_getters.get_artist_terms(h5))
    finally:
        # bug fix: close the h5 handle even if a getter raises
        h5.close()
    return f
Esempio n. 8
0
def getInfo(files):
    """
    Build a CSV string for *files* (MSD .h5 paths) containing the fields
    listed (whitespace-separated) in the file named by sys.argv[1].

    INPUT
       files - iterable of .h5 file paths
    RETURN
       CSV text: one header row of field names, then one row per file.
    """
    # Read the list of requested fields (the `with` block closes the file;
    # the original also called f.close() redundantly afterwards).
    with open(sys.argv[1], 'r') as f:
        contents = f.read()
    fields = contents.split()
    print("creating csv with following fields:" + contents)

    rows = [','.join(fields)]
    for fil in files:
        curFile = getters.open_h5_file_read(fil)
        try:
            values = []
            for field in fields:
                # perf fix: look each getter up lazily instead of eagerly
                # calling all ~55 getters per file and discarding most.
                getter = getattr(getters, 'get_' + field, None)
                if getter is None:
                    print('error: unspecified field')
                    exit(0)
                value = getter(curFile)
                # bug fix: the original stringified bytes (giving "b'...'")
                # and then deleted EVERY 'b' and quote character from the
                # final CSV — corrupting real data such as artist names that
                # contain the letter 'b'. Decode bytes explicitly instead.
                if isinstance(value, bytes):
                    value = value.decode('utf-8', 'replace')
                values.append(str(value).replace('\n', ''))
            rows.append(','.join(values))
        finally:
            curFile.close()
    # idiom/perf fix: join once instead of quadratic += string building
    return '\n'.join(rows) + '\n'
Esempio n. 9
0
def map(line):
	"""
	Generator mapper: open the song file at *line*, classify it, and
	yield a single (track_id, json_payload) pair.

	NOTE(review): the name shadows the builtin ``map`` — presumably
	required by the surrounding map/reduce driver; confirm before renaming.
	"""
	h5 = hdf5_getters.open_h5_file_read(line)
	if(h5):
		output_array=classifiers_base.classify(h5)
		yield(str(hdf5_getters.get_track_id(h5,0)),json.dumps(output_array))
		# NOTE(review): cleanup only runs if the consumer resumes the
		# generator after the yield; an abandoned generator leaves h5
		# to be closed by GeneratorExit/GC — verify the driver exhausts it.
		h5.close()
		del output_array
		gc.collect()
def get_geography_information(h5file):
    """
    Return [track_id, artist_location, artist_latitude, artist_longitude]
    for one MSD song file, packed into a numpy array.
    """
    h5 = GETTERS.open_h5_file_read(h5file)
    fields = [
        GETTERS.get_track_id(h5),
        GETTERS.get_artist_location(h5),
        GETTERS.get_artist_latitude(h5),
        GETTERS.get_artist_longitude(h5),
    ]
    h5.close()
    return np.array(fields)
def get_geography_information(h5file):
    """
    Extract the geography-related attributes of a song file: track id,
    artist location string, latitude and longitude, as a numpy array.
    """
    h5 = GETTERS.open_h5_file_read(h5file)
    tid = GETTERS.get_track_id(h5)
    location = GETTERS.get_artist_location(h5)
    latitude = GETTERS.get_artist_latitude(h5)
    longitude = GETTERS.get_artist_longitude(h5)
    h5.close()
    return np.array([tid, location, latitude, longitude])
Esempio n. 12
0
def process_filelist_train(filelist=None,testsongs=None,tmpfilename=None):
    """
    Main function, process all files in the list (as long as their track_id
    is not in testsongs)
    INPUT
       filelist     - a list of song files
       testsongs    - set of track ID that we should not use
       tmpfilename  - where to save our processed features
    NOTE: Python 2 code (print statements) using the legacy PyTables
    camelCase API (openFile/createGroup/createEArray).
    """
    # sanity check: all three keyword args are required despite the defaults
    for arg in locals().values():
        assert not arg is None,'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file',tmpfilename,'already exists.'
        return
    # dimension fixed (12-dimensional timbre vector)
    # NOTE(review): ndim is never used below — presumably documents the
    # timbre input dimensionality consumed inside compute_features; confirm.
    ndim = 12
    finaldim = 90
    # create outputfile with two parallel EArrays: one feature row and one
    # 18-char artist id per accepted song (appended in lockstep below)
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/",'data','TMP FILE FOR ARTIST RECOGNITION')
    output.createEArray(group,'feats',tables.Float64Atom(shape=()),(0,finaldim),'',
                        expectedrows=len(filelist))
    output.createEArray(group,'artist_id',tables.StringAtom(18,shape=()),(0,),'',
                        expectedrows=len(filelist))
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose progress marker every 50k files
        if cnt_f % 50000 == 0:
            print 'training... checking file #',cnt_f
        # check what file/song is this
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        track_id = GETTERS.get_track_id(h5)
        if track_id in testsongs: # just in case, but should not be necessary
            print 'Found test track_id during training? weird.',track_id
            h5.close()
            continue
        # extract features, then close file
        processed_feats = compute_features(h5)
        h5.close()
        # compute_features signalling failure with None: skip the song
        # (nothing is appended, keeping the two EArrays in lockstep)
        if processed_feats is None:
            continue
        # save features to tmp file
        output.root.data.artist_id.append( np.array( [artist_id] ) )
        output.root.data.feats.append( processed_feats )
    # we're done, close output
    output.close()
    return
Esempio n. 13
0
def func_to_get_desired_values(filename, returnValue = False):
    """
    Open one song file, pull every getter named in the global
    ``elementsRequested``, normalize each value, and either append the
    assembled record to the global ``all_desired_data`` (default) or
    return it.

    INPUT
    filename    - The name of the h5 file to be loaded
    returnValue - when True, return the record instead of appending it
    RETURN
    the record structure when returnValue is True, otherwise None
    NOTE: Python 2 code (uses the ``unicode`` builtin).
    """
    global all_desired_data
    # Open file
    h5 = GETTERS.open_h5_file_read(filename)

    # Create and fill a record
    record = []
    for element in elementsRequested:
        result = getattr(GETTERS, element)(h5)
        try:
            # replace empty strings with an author-chosen sentinel value
            if result == '':
                result = 'Adlen - void'
        except:
            # comparing an ndarray to '' can raise/warn; keep result as-is
            pass
        try:
            # arrays are collapsed to their mean; a length<=1 array becomes ''
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    result = float(np.mean(result))
                else:
                    result = ''
        except:
            # fall back to a plain float cast when possible
            try:
                result = float(result)
            except:
                pass
        record.append(result)

    song_id = GETTERS.get_track_id(h5)
    artist_name = GETTERS.get_artist_name(h5)
    title = GETTERS.get_title(h5)
    artist_mbtags = GETTERS.get_artist_mbtags(h5)
    release = GETTERS.get_release(h5)

    # decode the byte strings coming out of the h5 file
    song_id = unicode(song_id.decode('utf-8'))
    title = unicode(title.decode('utf-8'))
    artist_name = unicode(artist_name.decode('utf-8'))
    if not returnValue:
        all_desired_data.append([[[song_id, title, artist_name, elementsRequested], artist_name, title, artist_mbtags, release], record])
    
    h5.close()
    
    if returnValue:
        return [[[song_id, title, artist_name, elementsRequested], artist_name, title, artist_mbtags, release], record]
Esempio n. 14
0
 def func_to_get_tags(filename):
     """
     Open one song file and, when its track id belongs to the global
     ``ids`` collection, count each of its MusicBrainz artist tags into
     the global ``all_tags`` counter. Then close the file.
     """
     h5 = GETTERS.open_h5_file_read(filename)
     wanted = GETTERS.get_track_id(h5) in ids
     mbtags = GETTERS.get_artist_mbtags(h5)
     if wanted:
         for t in mbtags:
             all_tags[t] += 1
     h5.close()
def get_url(h5_file):
    """
    Find a preview-clip URL for the song in *h5_file*.

    Tries a direct lookup by (lowercased) Echo Nest track id first, then
    falls back to a text search on track title + artist name.
    RETURN preview URL string, or None when nothing was found.
    """
    artist_name = GETTERS.get_artist_name(h5_file)
    track_name = GETTERS.get_title(h5_file)
    echo_nest_id = GETTERS.get_track_id(h5_file).lower()

    # bug fix: the track id is a string, so `echo_nest_id >= 0` was
    # meaningless (always true on Python 2, TypeError on Python 3).
    # The intent was evidently "do we have an id at all".
    if echo_nest_id:
        preview = get_preview_from_trackid(echo_nest_id)
        if preview != '':
            return preview

    res = get_trackid_from_text_search(track_name, artistname=artist_name)
    if len(res) > 0:
        closest_track = get_closest_track(res, track_name)
        preview = get_preview_from_trackid(closest_track['id'])
        return preview

    return None
Esempio n. 16
0
def getTrackInfo(starting_num):
    """
    Read this thread's slice of tracks from the MSD summary file and return
    a list of [track_id, artist_name, duration, loudness, tempo, title,
    year] rows for every track whose id appears in the global
    ``lyric_track_ids_set``.
    """
    rows = []
    f = hdf5_getters.open_h5_file_read(filepath)
    progress_bar = tqdm(range(tracks_per_thread))
    offset = starting_num * tracks_per_thread
    for iteration in progress_bar:
        i = int(iteration) + offset
        track_id = hdf5_getters.get_track_id(f, i).decode()
        if track_id not in lyric_track_ids_set:
            continue  # no lyrics for this track -- skip it and go on
        rows.append([
            track_id,
            hdf5_getters.get_artist_name(f, i).decode(),
            hdf5_getters.get_duration(f, i),
            hdf5_getters.get_loudness(f, i),
            hdf5_getters.get_tempo(f, i),
            hdf5_getters.get_title(f, i).decode(),
            hdf5_getters.get_year(f, i),
        ])
        progress_bar.set_description("Iteration %d" % i)
    f.close()
    return rows
def get_all_data(target, basedir, ext='.h5') :

    # header
    target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                 "track_id", "song_id", "title", "artist_name", "artist_location",
                 "artist_hotttnesss", "release", "year", "song_hotttnesss",
                 "danceability", "duration", "loudness", "sample_rate", "tempo"
    ))

    count = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            for line in f:
                new_file = open("tmp.txt", 'w')
                new_file.write(line)

                h5 = hdf5_getters.open_h5_file_read(new_file)
                target.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                              hdf5_getters.get_track_id(h5),
                              hdf5_getters.get_song_id(h5),
                              hdf5_getters.get_title(h5),
                              hdf5_getters.get_artist_name(h5),
                              hdf5_getters.get_artist_location(h5),
                              hdf5_getters.get_artist_hotttnesss(h5),
                              hdf5_getters.get_release(h5),
                              hdf5_getters.get_year(h5),
                              hdf5_getters.get_song_hotttnesss(h5),
                              hdf5_getters.get_danceability(h5),
                              hdf5_getters.get_duration(h5),
                              hdf5_getters.get_loudness(h5),
                              hdf5_getters.get_analysis_sample_rate(h5),
                              hdf5_getters.get_tempo(h5)
                ))

                # show progress
                count += 1
                print "%d/10000" % (count)

                h5.close()
Esempio n. 18
0
def get_all_attributes(filename):
    """
    Append one tab-separated row of song/artist attributes for *filename*
    to attributes.csv.

    Does 3 simple things:
    - open the song file
    - get all required attributes
    - write them as one csv row (files lacking an attribute are skipped)
    - close the files
    """
    with open('attributes.csv', 'a') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        h5 = GETTERS.open_h5_file_read(filename)
        try:
            RESULTS = [
                GETTERS.get_year(h5),
                GETTERS.get_artist_id(h5),
                GETTERS.get_artist_name(h5),
                GETTERS.get_artist_mbid(h5),
                convert_terms(GETTERS.get_artist_terms(h5)),
                GETTERS.get_artist_hotttnesss(h5),
                GETTERS.get_artist_latitude(h5),
                GETTERS.get_artist_longitude(h5),
                GETTERS.get_artist_familiarity(h5),
                GETTERS.get_danceability(h5),
                GETTERS.get_duration(h5),
                GETTERS.get_energy(h5),
                GETTERS.get_loudness(h5),
                GETTERS.get_song_hotttnesss(h5),
                GETTERS.get_song_id(h5),
                GETTERS.get_tempo(h5),
                GETTERS.get_time_signature(h5),
                GETTERS.get_title(h5),
                GETTERS.get_track_id(h5),
                GETTERS.get_release(h5),
            ]
            csvwriter.writerow(RESULTS)
        except AttributeError:
            # some files lack attributes: skip the row (original behavior)
            pass
        finally:
            # bug fix: the original leaked the h5 handle whenever a getter
            # raised AttributeError (close() was inside the try body)
            h5.close()
Esempio n. 19
0
 def func_to_get_track_data(filename):
     """
     For a song file whose track id is in the global ``tracks`` set, build
     a row of scalar features plus condensed segment matrices and append it
     to the global ``list_of_rows``. Always closes the file; returns None.
     """
     h5 = GETTERS.open_h5_file_read(filename)
     track_id = GETTERS.get_track_id(h5)
     scalar_getters = (
         GETTERS.get_duration, GETTERS.get_key, GETTERS.get_loudness,
         GETTERS.get_mode, GETTERS.get_tempo, GETTERS.get_time_signature,
     )
     segment_getters = (
         GETTERS.get_segments_pitches, GETTERS.get_segments_timbre,
         GETTERS.get_segments_loudness_max,
         GETTERS.get_segments_loudness_max_time,
     )
     if track_id in tracks:
         row = [track_id]
         for g in scalar_getters:
             row.append(g(h5))
         for g in segment_getters:
             row += condense_segments(g(h5), new_size=num_segments)
         list_of_rows.append(row)
     h5.close()
     return None
Esempio n. 20
0
def feat_from_file(path):
    """
    Extract a list of features in an array, already converted to string.

    Columns: track id, loudness, tempo, time signature, key, mode,
    duration, song hotttnesss, then the timbre means, timbre variances,
    and the timbre vector of the loudest segment.
    """
    feats = []
    h5 = GETTERS.open_h5_file_read(path)
    # basic info
    feats.append(GETTERS.get_track_id(h5))
    feats.append(GETTERS.get_loudness(h5))
    feats.append(GETTERS.get_tempo(h5))
    feats.append(GETTERS.get_time_signature(h5))
    feats.append(GETTERS.get_key(h5))
    feats.append(GETTERS.get_mode(h5))
    feats.append(GETTERS.get_duration(h5))
    # bug fix: the module has no `get_hotnesss`; the MSD song-hotness
    # getter is spelled `get_song_hotttnesss` (triple-t) — the original
    # line raised AttributeError at runtime.
    feats.append(GETTERS.get_song_hotttnesss(h5))

    # index of the loudest segment
    segments_loudness = np.asarray(GETTERS.get_segments_loudness_max(h5))
    max_segment_indice = np.argmax(segments_loudness)
    # timbre: mean, variance, and the loudest segment's raw timbre vector
    timbre = GETTERS.get_segments_timbre(h5)
    max_segment_timbre = timbre[max_segment_indice, :]
    feats.extend(np.average(timbre, axis=0))
    feats.extend(np.var(timbre, axis=0))
    feats.extend(max_segment_timbre)
    # done with h5 file
    h5.close()
    # makes sure we return strings
    return [str(x) for x in feats]
def get_all_data(dir, ext, outDir):
    """
    Walk `dir`, extract a small set of fields from every song file with
    extension `ext`, and append them tab-separated to the file at `outDir`
    (one line per song; field order given by the global ``ordering``).

    NOTE(review): the original body mixed tabs and spaces beyond what
    Python can parse and carried ~50 commented-out getters; this is a
    cleaned reconstruction of the active code paths only. The output
    handle also shadowed the loop variable `f` — renamed to `out`.
    """
    for root, dirs, files in os.walk(dir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            h5 = get.open_h5_file_read(f)
            keys = {}
            keys['trackId'] = get.get_track_id(h5)
            keys['loudness'] = get.get_loudness(h5)
            # Segment-level arrays are flattened and serialised to strings.
            keys['segmentsStartArr'] = arrToStr(get.get_segments_start(h5))
            keys['segmentsPitchesArr'] = arrToStr(
                np.array(get.get_segments_pitches(h5)).flatten())
            keys['segmentsTimbreArr'] = arrToStr(
                np.array(get.get_segments_timbre(h5)).flatten())
            keys['songId'] = get.get_song_id(h5)
            keys['title'] = get.get_title(h5)

            with open(outDir, 'a') as out:
                for key in ordering:
                    out.write('{0}\t'.format(keys[key]))
                out.write('\n')
            h5.close()
    counter = 0
    file_io_counter = 0
    for root, dirs, files in os.walk(os.getcwd()):
        for name in files:
            if name.endswith(".h5"):
                if process_completion % 10000 == 0:
                    print "done :", process_completion/10000.0, "%"
                    process_completion += 1
                else:
                    process_completion += 1

                tempPath = os.path.abspath(os.path.join(root,name))
                h5file = hdf5_getters.open_h5_file_read(tempPath)

                #meta info
                track_id_str = hdf5_getters.get_track_id(h5file).replace(",","")
                song_id_str = hdf5_getters.get_song_id(h5file).replace(",","")
                title_str = hdf5_getters.get_title(h5file).replace(","," ")
                artist_name_str = hdf5_getters.get_artist_name(h5file).replace(","," ")
                artist_location_str = hdf5_getters.get_artist_location(h5file).replace(","," ")

                # song-level data info
                duration = hdf5_getters.get_duration(h5file)
                key = hdf5_getters.get_key(h5file)
                mode = hdf5_getters.get_mode(h5file)
                tempo = hdf5_getters.get_tempo(h5file)
                time_signature = hdf5_getters.get_time_signature(h5file)

                h5file.close()

                counter += 1
Esempio n. 23
0
 def func_to_get_track_ids(filename):
     """
     Open one song file, record its track id into the global
     ``subset_ids`` set, and close the file again.
     """
     h5 = GETTERS.open_h5_file_read(filename)
     subset_ids.add(GETTERS.get_track_id(h5))
     h5.close()
Esempio n. 24
0
def process_filelist_train(filelist=None,
                           testartists=None,
                           tmpfilename=None,
                           npicks=None,
                           winsize=None,
                           finaldim=None,
                           typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is not in testartists) and dump compressed features to a temporary
    HDF5 file containing three EArrays: 'feats', 'year' and 'track_id'.
    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - one of 'picks' (win of btchroma), 'corrcoeff' (correlation coefficients),
                      'cov' (covariance) or 'avgcov' (average covariance)
    """
    # sanity check
    # all keyword arguments default to None only so that callers are forced
    # to supply every one of them explicitly
    for arg in locals().values():
        assert not arg is None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    # create outputfile
    # NOTE: tables.openFile/createGroup/createEArray are the legacy
    # PyTables 2.x API names
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,
                        'feats',
                        tables.Float64Atom(shape=()), (0, finaldim),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'year',
                        tables.IntAtom(shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'track_id',
                        tables.StringAtom(18, shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    # random projection
    # the projection's input dimension depends on the compression type
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check file
        # read only the metadata needed for filtering, then close right away
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        # skip songs without a known year (0) and songs by test artists
        if year <= 0 or artist_id in testartists:
            continue
        # we have a train artist with a song year, we're good
        bttimbre = get_bttimbre(f)
        if typecompress == 'picks':
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre,
                                                        npicks,
                                                        winsize,
                                                        finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,
                                                     finaldim,
                                                     randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,
                                                    finaldim,
                                                    randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,
                                                       finaldim,
                                                       randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        # save them to tmp file
        # one row of year/track_id per feature row, so the arrays stay aligned
        n_p_feats = processed_feats.shape[0]
        output.root.data.year.append(np.array([year] * n_p_feats))
        output.root.data.track_id.append(np.array([track_id] * n_p_feats))
        output.root.data.feats.append(processed_feats)
    # we're done, close output
    output.close()
    return
def data_to_flat_file(basedir,ext='.h5') :
    """This function extract the information from the tables and creates the flat file."""	
    count = 0;	#song counter
    list_to_write= []
    row_to_write = ""
    writer = csv.writer(open("metadata_wholeA.csv", "wb"))
    for root, dirs, files in os.walk(basedir):
	files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
	    print f	#the name of the file
            h5 = hdf5_getters.open_h5_file_read(f)
	    title = hdf5_getters.get_title(h5) 
	    title= title.replace('"','') 
	    comma=title.find(',')	#eliminating commas in the title
	    if	comma != -1:
		    print title
		    time.sleep(1)
	    album = hdf5_getters.get_release(h5)
	    album= album.replace('"','')	#eliminating commas in the album	
	    comma=album.find(',')
	    if	comma != -1:
		    print album
		    time.sleep(1)
	    artist_name = hdf5_getters.get_artist_name(h5)
	    comma=artist_name.find(',')
	    if	comma != -1:
		    print artist_name
		    time.sleep(1)
	    artist_name= artist_name.replace('"','')	#eliminating double quotes
	    duration = hdf5_getters.get_duration(h5)
	    samp_rt = hdf5_getters.get_analysis_sample_rate(h5)
	    artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5)
	    artist_fam = hdf5_getters.get_artist_familiarity(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_fam) == True:
	            artist_fam=-1
	    artist_hotness= hdf5_getters.get_artist_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_hotness) == True:
	            artist_hotness=-1
	    artist_id = hdf5_getters.get_artist_id(h5)
	    artist_lat = hdf5_getters.get_artist_latitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lat) == True:
	            artist_lat=-1
	    artist_loc = hdf5_getters.get_artist_location(h5)
		#checks artist_loc to see if it is a hyperlink if it is set as empty string
	    artist_loc = artist_loc.replace(",", "\,");
	    if artist_loc.startswith("<a"):
                artist_loc = ""
	    if len(artist_loc) > 100:
                artist_loc = ""
	    artist_lon = hdf5_getters.get_artist_longitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lon) == True:
	            artist_lon=-1
	    artist_mbid = hdf5_getters.get_artist_mbid(h5)
	    artist_pmid = hdf5_getters.get_artist_playmeid(h5)
	    audio_md5 = hdf5_getters.get_audio_md5(h5)
	    danceability = hdf5_getters.get_danceability(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(danceability) == True:
	            danceability=-1
	    end_fade_in =hdf5_getters.get_end_of_fade_in(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(end_fade_in) == True:
	            end_fade_in=-1
	    energy = hdf5_getters.get_energy(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(energy) == True:
	            energy=-1
            song_key = hdf5_getters.get_key(h5)
	    key_c = hdf5_getters.get_key_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(key_c) == True:
	            key_c=-1
	    loudness = hdf5_getters.get_loudness(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(loudness) == True:
	            loudness=-1
	    mode = hdf5_getters.get_mode(h5)
	    mode_conf = hdf5_getters.get_mode_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(mode_conf) == True:
	            mode_conf=-1
	    release_7digitalid = hdf5_getters.get_release_7digitalid(h5)
	    song_hot = hdf5_getters.get_song_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(song_hot) == True:
	            song_hot=-1
	    song_id = hdf5_getters.get_song_id(h5)
	    start_fade_out = hdf5_getters.get_start_of_fade_out(h5)
	    tempo = hdf5_getters.get_tempo(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(tempo) == True:
	            tempo=-1
	    time_sig = hdf5_getters.get_time_signature(h5)
	    time_sig_c = hdf5_getters.get_time_signature_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(time_sig_c) == True:
	            time_sig_c=-1
	    track_id = hdf5_getters.get_track_id(h5)
	    track_7digitalid = hdf5_getters.get_track_7digitalid(h5)
	    year = hdf5_getters.get_year(h5)
	    bars_c = hdf5_getters.get_bars_confidence(h5)
	    bars_c_avg= get_avg(bars_c)
	    bars_c_max= get_max(bars_c)
	    bars_c_min = get_min(bars_c)
	    bars_c_stddev= get_stddev(bars_c)
	    bars_c_count = get_count(bars_c)
	    bars_c_sum = get_sum(bars_c)
	    bars_start = hdf5_getters.get_bars_start(h5)
	    bars_start_avg = get_avg(bars_start)
	    bars_start_max= get_max(bars_start)
	    bars_start_min = get_min(bars_start)
	    bars_start_stddev= get_stddev(bars_start)
	    bars_start_count = get_count(bars_start)
	    bars_start_sum = get_sum(bars_start)
            beats_c = hdf5_getters.get_beats_confidence(h5)
            beats_c_avg= get_avg(beats_c)
	    beats_c_max= get_max(beats_c)
	    beats_c_min = get_min(beats_c)
	    beats_c_stddev= get_stddev(beats_c)
	    beats_c_count = get_count(beats_c)
	    beats_c_sum = get_sum(beats_c)
            beats_start = hdf5_getters.get_beats_start(h5)
 	    beats_start_avg = get_avg(beats_start)
	    beats_start_max= get_max(beats_start)
	    beats_start_min = get_min(beats_start)
	    beats_start_stddev= get_stddev(beats_start)
	    beats_start_count = get_count(beats_start)
	    beats_start_sum = get_sum(beats_start)
	    sec_c = hdf5_getters.get_sections_confidence(h5)
            sec_c_avg= get_avg(sec_c)
	    sec_c_max= get_max(sec_c)
	    sec_c_min = get_min(sec_c)
	    sec_c_stddev= get_stddev(sec_c)
	    sec_c_count = get_count(sec_c)
	    sec_c_sum = get_sum(sec_c)
	    sec_start = hdf5_getters.get_sections_start(h5)
            sec_start_avg = get_avg(sec_start)
	    sec_start_max= get_max(sec_start)
	    sec_start_min = get_min(sec_start)
	    sec_start_stddev= get_stddev(sec_start)
	    sec_start_count = get_count(sec_start)
	    sec_start_sum = get_sum(sec_start)
	    seg_c = hdf5_getters.get_segments_confidence(h5)
	    seg_c_avg= get_avg(seg_c)
	    seg_c_max= get_max(seg_c)
	    seg_c_min = get_min(seg_c)
	    seg_c_stddev= get_stddev(seg_c)
	    seg_c_count = get_count(seg_c)
	    seg_c_sum = get_sum(seg_c)
            seg_loud_max = hdf5_getters.get_segments_loudness_max(h5)
            seg_loud_max_avg= get_avg(seg_loud_max)
	    seg_loud_max_max= get_max(seg_loud_max)
	    seg_loud_max_min = get_min(seg_loud_max)
	    seg_loud_max_stddev= get_stddev(seg_loud_max)
	    seg_loud_max_count = get_count(seg_loud_max)
	    seg_loud_max_sum = get_sum(seg_loud_max)
	    seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
	    seg_loud_max_time_avg= get_avg(seg_loud_max_time)
	    seg_loud_max_time_max= get_max(seg_loud_max_time)
	    seg_loud_max_time_min = get_min(seg_loud_max_time)
	    seg_loud_max_time_stddev= get_stddev(seg_loud_max_time)
	    seg_loud_max_time_count = get_count(seg_loud_max_time)
	    seg_loud_max_time_sum = get_sum(seg_loud_max_time)
	    seg_loud_start = hdf5_getters.get_segments_loudness_start(h5)
	    seg_loud_start_avg= get_avg(seg_loud_start)
	    seg_loud_start_max= get_max(seg_loud_start)
	    seg_loud_start_min = get_min(seg_loud_start)
	    seg_loud_start_stddev= get_stddev(seg_loud_start)
	    seg_loud_start_count = get_count(seg_loud_start)
	    seg_loud_start_sum = get_sum(seg_loud_start)					      
	    seg_pitch = hdf5_getters.get_segments_pitches(h5)
	    pitch_size = len(seg_pitch)
	    seg_start = hdf5_getters.get_segments_start(h5)
	    seg_start_avg= get_avg(seg_start)
	    seg_start_max= get_max(seg_start)
	    seg_start_min = get_min(seg_start)
	    seg_start_stddev= get_stddev(seg_start)
	    seg_start_count = get_count(seg_start)
	    seg_start_sum = get_sum(seg_start)
	    seg_timbre = hdf5_getters.get_segments_timbre(h5)
	    tatms_c = hdf5_getters.get_tatums_confidence(h5)
	    tatms_c_avg= get_avg(tatms_c)
	    tatms_c_max= get_max(tatms_c)
	    tatms_c_min = get_min(tatms_c)
	    tatms_c_stddev= get_stddev(tatms_c)
	    tatms_c_count = get_count(tatms_c)
	    tatms_c_sum = get_sum(tatms_c)
	    tatms_start = hdf5_getters.get_tatums_start(h5)
	    tatms_start_avg= get_avg(tatms_start)
	    tatms_start_max= get_max(tatms_start)
	    tatms_start_min = get_min(tatms_start)
	    tatms_start_stddev= get_stddev(tatms_start)
	    tatms_start_count = get_count(tatms_start)
	    tatms_start_sum = get_sum(tatms_start)
	
	    #Getting the genres
	    genre_set = 0    #flag to see if the genre has been set or not
	    art_trm = hdf5_getters.get_artist_terms(h5)
	    trm_freq = hdf5_getters.get_artist_terms_freq(h5)
	    trn_wght = hdf5_getters.get_artist_terms_weight(h5)
	    a_mb_tags = hdf5_getters.get_artist_mbtags(h5)
	    genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq
	    final_genre=[]
	    genres_so_far=[]
	    for i in range(len(genre_indexes)):
		    genre_tmp=get_genre(art_trm,genre_indexes[i])   #genre that corresponds to the highest freq
		    genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary
		    if len(genres_so_far) != 0:
			    for i in genres_so_far:
				final_genre.append(i)
				genre_set=1				#genre was found in dictionary
				  
		
	    
	    if genre_set == 1:
		    col_num=[]
		   
		    for genre in final_genre:
			    column=int(genre)				#getting the column number of the genre
			    col_num.append(column)

		    genre_array=genre_columns(col_num)	         #genre array
 	    else:
		    genre_array=genre_columns(-1)		#the genre was not found in the dictionary

	    transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_pitch_avg=[]
	    seg_pitch_max=[]
	    seg_pitch_min=[]
            seg_pitch_stddev=[]
            seg_pitch_count=[]
	    seg_pitch_sum=[]
            i=0
	    #Getting the aggregate values in the pitches array
	    for row in transpose_pitch:
		   seg_pitch_avg.append(get_avg(row))
		   seg_pitch_max.append(get_max(row))
	           seg_pitch_min.append(get_min(row))
		   seg_pitch_stddev.append(get_stddev(row))
		   seg_pitch_count.append(get_count(row))
                   seg_pitch_sum.append(get_sum(row))
		   i=i+1

	    #extracting information from the timbre array 
            transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_timbre_avg=[]
	    seg_timbre_max=[]
	    seg_timbre_min=[]
            seg_timbre_stddev=[]
            seg_timbre_count=[]
	    seg_timbre_sum=[]
            i=0
	    for row in transpose_timbre:
		   seg_timbre_avg.append(get_avg(row))
		   seg_timbre_max.append(get_max(row))
	           seg_timbre_min.append(get_min(row))
		   seg_timbre_stddev.append(get_stddev(row))
		   seg_timbre_count.append(get_count(row))
                   seg_timbre_sum.append(get_sum(row))
		   i=i+1
		


		#Writing to the flat file
            writer.writerow([title,album,artist_name,year,duration,seg_start_count, tempo])

	    h5.close()
	    count=count+1;
	    print count;
Esempio n. 26
0
def parse_aggregate_songs(file_name,file_name2,artist_map):
    """
    Given an aggregate filename and artist_map in the format
    {artist_name: {data pertaining to artist}}
    """
    """
    TODO: 
    -this function goes through each song, if artist not in there,
    add all data necesary and add first song info.
    else update any specific song info

    -song info is a map from attributename:[values]
    """
    #artist_map = {}
    h5 = hdf5_getters.open_h5_file_read(file_name)
    numSongs = hdf5_getters.get_num_songs(h5)
    print 'Parsing song file...'
    for i in range(numSongs):
        artist_name = hdf5_getters.get_artist_name(h5,i)

        #Filter location
        longi = hdf5_getters.get_artist_longitude(h5,i)
        lat = hdf5_getters.get_artist_latitude(h5,i)
        loc = hdf5_getters.get_artist_location(h5,i)
        if math.isnan(lat) or math.isnan(longi):
            #skip if no location
            continue

        #filter year
        yr = hdf5_getters.get_year(h5,i)
        if yr == 0:
            #skip if no year
            continue

        #filter hotttness and familiarity
        familiarity = hdf5_getters.get_artist_familiarity(h5,i)
        hotttness = hdf5_getters.get_artist_hotttnesss(h5,i)
        if familiarity<=0.0 or hotttness<=0.0:
            #skip if no hotttness or familiarity computations
            continue

        #TODO:MAYBE filter on dance and energy
        timbre = hdf5_getters.get_segments_timbre(h5,i)
        #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each
        if not artist_name in artist_map:
            #have not encountered the artist yet, so populate new map
            sub_map = {}
            sub_map['artist_familiarity'] = familiarity
            sub_map['artist_hotttnesss'] = hotttness
            sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i)
            #longi = hdf5_getters.get_artist_longitude(h5,i)
            #lat = hdf5_getters.get_artist_latitude(h5,i)
            #longi = None if math.isnan(longi) else longi
            #lat = None if math.isnan(lat) else lat
            sub_map['artist_latitude'] = lat
            sub_map['artist_longitude'] = longi
            sub_map['artist_location'] = loc
            sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i)
            #TODO:see if should weight by freq or weight for if the term matches one of the feature terms
            sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i))
            sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i))

            #song-sepcific data
            #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA:
            #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            sub_map['danceability'] = [dance]
            sub_map['duration'] = [hdf5_getters.get_duration(h5,i)]
            sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)]
            sub_map['energy'] = [energy]
            #since each song has a key, ask if feature for keys should be num of songs that appear in that key or
            #just binary if any of their songs has that key or just be avg of songs with that key
            #same for mode, since its either major or minor...should it be count or avg.?
            sub_map['key'] = [hdf5_getters.get_key(h5,i)]
            sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)]
            sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            sub_map['song_hotttnesss'] = [s_hot]
            sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)]
            sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)]
            #should time signature be count as well? binary?
            sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)]
            sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)]
            #should year be binary since they can have many songs across years and should it be year:count
            sub_map['year'] = [yr]

            artist_map[artist_name] = sub_map
        else:
            #artist already exists, so get its map and update song fields
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            artist_map[artist_name]['danceability'].append(dance)
            artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i))
            artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i))
            artist_map[artist_name]['energy'].append(energy)
            artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i))
            artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i))
            artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            artist_map[artist_name]['song_hotttnesss'].append(s_hot)
            artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i))
            artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i))
            #should time signature be count as well? binary?
            artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i))
            artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i))
            #should year be binary since they can have many songs across years and should it be year:count
            artist_map[artist_name]['year'].append(yr)

    h5 = hdf5_getters.open_h5_file_read(file_name2)
    numSongs = hdf5_getters.get_num_songs(h5)
    print 'Parsing song file2...'
    for i in range(numSongs):
        song_id = hdf5_getters.get_track_id(h5,i)
        artist_name = hdf5_getters.get_artist_name(h5,i)
        if artist_name in artist_map and song_id in artist_map[artist_name]['track_id']:
            continue

        #Filter location
        longi = hdf5_getters.get_artist_longitude(h5,i)
        lat = hdf5_getters.get_artist_latitude(h5,i)
        loc = hdf5_getters.get_artist_location(h5,i)
        if math.isnan(lat) or math.isnan(longi):
            #skip if no location
            continue

        #filter year
        yr = hdf5_getters.get_year(h5,i)
        if yr == 0:
            #skip if no year
            continue

        #filter hotttness and familiarity
        familiarity = hdf5_getters.get_artist_familiarity(h5,i)
        hotttness = hdf5_getters.get_artist_hotttnesss(h5,i)
        if familiarity<=0.0 or hotttness<=0.0:
            #skip if no hotttness or familiarity computations
            continue

        #TODO:MAYBE filter on dance and energy
        timbre = hdf5_getters.get_segments_timbre(h5,i)
        #timbre[#] gives len 12 array so for each arr in timbre, add up to get segment and add to corresponding 12 features and avg across each
        if not artist_name in artist_map:
            #have not encountered the artist yet, so populate new map
            sub_map = {}
            sub_map['artist_familiarity'] = familiarity
            sub_map['artist_hotttnesss'] = hotttness
            sub_map['artist_id'] = hdf5_getters.get_artist_id(h5,i)
            #longi = hdf5_getters.get_artist_longitude(h5,i)
            #lat = hdf5_getters.get_artist_latitude(h5,i)
            #longi = None if math.isnan(longi) else longi
            #lat = None if math.isnan(lat) else lat
            sub_map['artist_latitude'] = lat
            sub_map['artist_longitude'] = longi
            sub_map['artist_location'] = loc
            sub_map['artist_terms'] = hdf5_getters.get_artist_terms(h5,i)
            #TODO:see if should weight by freq or weight for if the term matches one of the feature terms
            sub_map['artist_terms_freq'] = list(hdf5_getters.get_artist_terms_freq(h5,i))
            sub_map['artist_terms_weight'] = list(hdf5_getters.get_artist_terms_weight(h5,i))

            #song-sepcific data
            #TODO COMPUTE AN AVG TIMBRE FOR A SONG BY IDEA:
            #SUMMING DOWN EACH 12 VECTOR FOR EACH PT IN SONG AND AVG THIS ACROSS SONG
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            sub_map['danceability'] = [dance]
            sub_map['duration'] = [hdf5_getters.get_duration(h5,i)]
            sub_map['end_of_fade_in'] = [hdf5_getters.get_end_of_fade_in(h5,i)]
            sub_map['energy'] = [energy]
            #since each song has a key, ask if feature for keys should be num of songs that appear in that key or
            #just binary if any of their songs has that key or just be avg of songs with that key
            #same for mode, since its either major or minor...should it be count or avg.?
            sub_map['key'] = [hdf5_getters.get_key(h5,i)]
            sub_map['loudness'] = [hdf5_getters.get_loudness(h5,i)]
            sub_map['mode'] = [hdf5_getters.get_mode(h5,i)] #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            sub_map['song_hotttnesss'] = [s_hot]
            sub_map['start_of_fade_out'] = [hdf5_getters.get_start_of_fade_out(h5,i)]
            sub_map['tempo'] = [hdf5_getters.get_tempo(h5,i)]
            #should time signature be count as well? binary?
            sub_map['time_signature'] = [hdf5_getters.get_time_signature(h5,i)]
            sub_map['track_id'] = [hdf5_getters.get_track_id(h5,i)]
            #should year be binary since they can have many songs across years and should it be year:count
            sub_map['year'] = [yr]

            artist_map[artist_name] = sub_map
        else:
            #artist already exists, so get its map and update song fields
            dance = hdf5_getters.get_danceability(h5,i)
            dance = None if dance == 0.0 else dance
            energy = hdf5_getters.get_energy(h5,i)
            energy = None if energy == 0.0 else energy
            artist_map[artist_name]['danceability'].append(dance)
            artist_map[artist_name]['duration'].append(hdf5_getters.get_duration(h5,i))
            artist_map[artist_name]['end_of_fade_in'].append(hdf5_getters.get_end_of_fade_in(h5,i))
            artist_map[artist_name]['energy'].append(energy)
            artist_map[artist_name]['key'].append(hdf5_getters.get_key(h5,i))
            artist_map[artist_name]['loudness'].append(hdf5_getters.get_loudness(h5,i))
            artist_map[artist_name]['mode'].append(hdf5_getters.get_mode(h5,i)) #major or minor 0/1
            s_hot = hdf5_getters.get_song_hotttnesss(h5,i)
            s_hot = None if math.isnan(s_hot) else s_hot
            artist_map[artist_name]['song_hotttnesss'].append(s_hot)
            artist_map[artist_name]['start_of_fade_out'].append(hdf5_getters.get_start_of_fade_out(h5,i))
            artist_map[artist_name]['tempo'].append(hdf5_getters.get_tempo(h5,i))
            #should time signature be count as well? binary?
            artist_map[artist_name]['time_signature'].append(hdf5_getters.get_time_signature(h5,i))
            artist_map[artist_name]['track_id'].append(hdf5_getters.get_track_id(h5,i))
            #should year be binary since they can have many songs across years and should it be year:count
            artist_map[artist_name]['year'].append(yr)
Esempio n. 27
0
h5 = hdf5_getters.open_h5_file_read("files/msd_summary_file.h5")
data = []

length = hdf5_getters.get_num_songs(h5)

print("number of songs = ",length)


def _strip_bytes_repr(value):
    """Render *value* as text, dropping the b'...' wrapper and any quotes."""
    return str(value).replace("b'", "").replace("'", "")


count = 0
for i in range(0, length):
    # keep only songs with a known release year
    if hdf5_getters.get_year(h5, songidx=i) == 0:
        continue
    count += 1
    row = [
        _strip_bytes_repr(hdf5_getters.get_track_id(h5, songidx=i)),
        hdf5_getters.get_year(h5, songidx=i),                              # 0
        hdf5_getters.get_song_hotttnesss(h5, songidx=i),                   # 1
        _strip_bytes_repr(hdf5_getters.get_title(h5, songidx=i)),          # 2
        _strip_bytes_repr(hdf5_getters.get_artist_id(h5, songidx=i)),      # 3
        hdf5_getters.get_artist_latitude(h5, songidx=i),                   # 4
        hdf5_getters.get_artist_longitude(h5, songidx=i),                  # 5
        _strip_bytes_repr(hdf5_getters.get_artist_location(h5, songidx=i)),  # 6
        _strip_bytes_repr(hdf5_getters.get_artist_name(h5, songidx=i)),    # 7
        _strip_bytes_repr(hdf5_getters.get_song_id(h5, songidx=i)),
    ]
    data.append(row)
    print(count)

h5.close()
# sort by release year (row index 1)
data = sorted(data, key=operator.itemgetter(1))
Esempio n. 28
0
def fill_attributes(song, songH5File):
    """Populate every attribute of *song* from an open MSD HDF5 song file.

    Scalar fields are stored as strings.  Numeric array fields are reduced
    to (mean, var) pairs via convert_array_to_meanvar (1-D) or
    covert_2darray_to_meanvar (2-D); string arrays are flattened with
    convert_array_to_string.  Returns the mutated *song* object.
    """

    #----------------------------non array attributes-------------------------------
    song.analysisSampleRate = str(
        hdf5_getters.get_analysis_sample_rate(songH5File))
    song.artistDigitalID = str(hdf5_getters.get_artist_7digitalid(songH5File))
    song.artistFamiliarity = str(
        hdf5_getters.get_artist_familiarity(songH5File))
    # BUG FIX: hdf5_getters has no get_artist_hottness; the getter is spelled
    # get_artist_hotttnesss, so the original line raised AttributeError.
    song.artistHotness = str(hdf5_getters.get_artist_hotttnesss(songH5File))
    song.artistID = str(hdf5_getters.get_artist_id(songH5File))
    song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File))
    song.artistLocation = str(hdf5_getters.get_artist_location(songH5File))
    song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File))
    song.artistmbID = str(hdf5_getters.get_artist_mbid(songH5File))
    song.artistName = str(hdf5_getters.get_artist_name(songH5File))
    song.artistPlayMeID = str(hdf5_getters.get_artist_playmeid(songH5File))
    song.audioMD5 = str(hdf5_getters.get_audio_md5(songH5File))
    song.danceability = str(hdf5_getters.get_danceability(songH5File))
    song.duration = str(hdf5_getters.get_duration(songH5File))
    song.endOfFadeIn = str(hdf5_getters.get_end_of_fade_in(songH5File))
    song.energy = str(hdf5_getters.get_energy(songH5File))
    song.key = str(hdf5_getters.get_key(songH5File))
    song.keyConfidence = str(hdf5_getters.get_key_confidence(songH5File))
    song.segementsConfidence = str(
        hdf5_getters.get_segments_confidence(songH5File))
    # BUG FIX: this second assignment used to clobber segementsConfidence
    # with the *sections* confidence; store it under its own attribute.
    song.sectionsConfidence = str(
        hdf5_getters.get_sections_confidence(songH5File))
    song.loudness = str(hdf5_getters.get_loudness(songH5File))
    song.mode = str(hdf5_getters.get_mode(songH5File))
    song.modeConfidence = str(hdf5_getters.get_mode_confidence(songH5File))
    song.release = str(hdf5_getters.get_release(songH5File))
    song.releaseDigitalID = str(
        hdf5_getters.get_release_7digitalid(songH5File))
    song.songHotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File))
    song.startOfFadeOut = str(hdf5_getters.get_start_of_fade_out(songH5File))
    song.tempo = str(hdf5_getters.get_tempo(songH5File))
    song.timeSignature = str(hdf5_getters.get_time_signature(songH5File))
    song.timeSignatureConfidence = str(
        hdf5_getters.get_time_signature_confidence(songH5File))
    song.title = str(hdf5_getters.get_title(songH5File))
    song.trackID = str(hdf5_getters.get_track_id(songH5File))
    song.trackDigitalID = str(hdf5_getters.get_track_7digitalid(songH5File))
    song.year = str(hdf5_getters.get_year(songH5File))

    #-------------------------------array attributes--------------------------------------
    #array float
    song.beatsStart_mean, song.beatsStart_var = convert_array_to_meanvar(
        hdf5_getters.get_beats_start(songH5File))
    #array float
    song.artistTermsFreq_mean, song.artistTermsFreq_var = convert_array_to_meanvar(
        hdf5_getters.get_artist_terms_freq(songH5File))
    #array float
    song.artistTermsWeight_mean, song.artistTermsWeight_var = convert_array_to_meanvar(
        hdf5_getters.get_artist_terms_weight(songH5File))
    #array int
    song.artistmbTagsCount_mean, song.artistmbTagsCount_var = convert_array_to_meanvar(
        hdf5_getters.get_artist_mbtags_count(songH5File))
    #array float
    song.barsConfidence_mean, song.barsConfidence_var = convert_array_to_meanvar(
        hdf5_getters.get_bars_confidence(songH5File))
    #array float
    song.barsStart_mean, song.barsStart_var = convert_array_to_meanvar(
        hdf5_getters.get_bars_start(songH5File))
    #array float
    song.beatsConfidence_mean, song.beatsConfidence_var = convert_array_to_meanvar(
        hdf5_getters.get_beats_confidence(songH5File))
    #array float
    song.sectionsConfidence_mean, song.sectionsConfidence_var = convert_array_to_meanvar(
        hdf5_getters.get_sections_confidence(songH5File))
    #array float
    song.sectionsStart_mean, song.sectionsStart_var = convert_array_to_meanvar(
        hdf5_getters.get_sections_start(songH5File))
    #array float
    song.segmentsConfidence_mean, song.segmentsConfidence_var = convert_array_to_meanvar(
        hdf5_getters.get_segments_confidence(songH5File))
    #array float
    song.segmentsLoudness_mean, song.segmentsLoudness_var = convert_array_to_meanvar(
        hdf5_getters.get_segments_loudness_max(songH5File))
    #array float
    song.segmentsLoudnessMaxTime_mean, song.segmentsLoudnessMaxTime_var = convert_array_to_meanvar(
        hdf5_getters.get_segments_loudness_max_time(songH5File))
    #array float
    # NOTE(review): this stores segments_loudness_start under a "MaxStart"
    # name — confirm whether get_segments_loudness_start is really intended.
    song.segmentsLoudnessMaxStart_mean, song.segmentsLoudnessMaxStart_var = convert_array_to_meanvar(
        hdf5_getters.get_segments_loudness_start(songH5File))
    #array float
    song.segmentsStart_mean, song.segmentsStart_var = convert_array_to_meanvar(
        hdf5_getters.get_segments_start(songH5File))
    #array float
    song.tatumsConfidence_mean, song.tatumsConfidence_var = convert_array_to_meanvar(
        hdf5_getters.get_tatums_confidence(songH5File))
    #array float
    song.tatumsStart_mean, song.tatumsStart_var = convert_array_to_meanvar(
        hdf5_getters.get_tatums_start(songH5File))
    #array2d float
    song.segmentsTimbre_mean, song.segmentsTimbre_var = covert_2darray_to_meanvar(
        hdf5_getters.get_segments_timbre(songH5File))
    #array2d float
    song.segmentsPitches_mean, song.segmentsPitches_var = covert_2darray_to_meanvar(
        hdf5_getters.get_segments_pitches(songH5File))

    #------------------------array string attributes------------------------
    song.similarArtists = convert_array_to_string(
        hdf5_getters.get_similar_artists(songH5File))  #array string
    song.artistTerms = convert_array_to_string(
        hdf5_getters.get_artist_terms(songH5File))  #array string
    song.artistmbTags = convert_array_to_string(
        hdf5_getters.get_artist_mbtags(songH5File))  #array string

    return song
        # Fragment: enclosing function and outer os.walk loop start outside
        # this view; process_completion, header_counter, featureFunctions and
        # featureMatrix are presumably defined there — TODO confirm.
        for name in files:
            if name.endswith(".h5"):
                # Print a progress line roughly every 2350 files
                # (counter increments happen on both branches).
                if process_completion % 2350 == 0:
                    print "done :", process_completion/2350.0, "%"
                    process_completion += 1
                    header_counter += 1
                else:
                    process_completion += 1
                    header_counter += 1

                tempPath = os.path.abspath(os.path.join(root,name))
                # NOTE(review): h5SongFile is never closed in this visible
                # span — confirm it is closed later or this leaks handles.
                h5SongFile = hdf5_getters.open_h5_file_read(tempPath)

                # create the sample row for feature matrix
                # initialize the list with track ID and song ID
                sampleRow = [hdf5_getters.get_track_id(h5SongFile),
                             hdf5_getters.get_song_id(h5SongFile)]

                cnt = 0
                # Each feature function returns a list that extends the row.
                for func in featureFunctions:
                    cnt += 1
                    # print "functionNumber: ", cnt
                    # print "BarStrctr: ", hdf5_getters.get_bars_start(h5SongFile)
                    sampleRow += func(h5SongFile)

                # Construct the feature matrix that contains all the songs...
                # Data structure for featureMatrix will be list of lists, so that...
                # Numpy will be able to convert it to a matrix in a way that...
                # All list elements will become rows
                # featureMatrix += [sampleRow]
                # NOTE(review): this REPLACES the matrix with a single row on
                # every iteration; the commented `+=` above looks like the
                # intended accumulation — confirm.
                featureMatrix = [sampleRow]
 # Fragment: enclosing loop starts outside this view; g aliases
 # hdf5_getters and the *_names/*_id/... accumulator lists are presumably
 # module-level — TODO confirm.
 artist_long = g.get_artist_longitude(h5)
 artist_loc = g.get_artist_location(h5)
 song_idss = g.get_song_id(h5)
 song_speed = g.get_tempo(h5)
 song_bar = g.get_bars_start(h5)
 song_beat = g.get_beats_start(h5)
 song_time_signature = g.get_time_signature(h5)
 song_tat = g.get_tatums_start(h5)
 song_mode = g.get_mode(h5)
 song_key = g.get_key(h5)
 # NOTE(review): song_idss was already fetched above — redundant call.
 song_idss = g.get_song_id(h5)
 song_title = g.get_title(h5)
 song_duration = g.get_duration(h5)
 song_release_years = g.get_year(h5)
 song_hot = g.get_song_hotttnesss (h5)
 track_idss = g.get_track_id(h5)
 
 # Append the fetched values to the parallel accumulator lists.
 # NOTE(review): song_mode/song_key/song_title/song_duration/
 # song_release_years/song_hot/track_idss are not appended in this visible
 # span — presumably appended below the cut; verify.
 file_path.append(filepath)
 artist_names.append(artist_name)
 artist_familiarty.append(artist_familar)
 artist_hotness.append(artist_hot)
 artist_id.append(artist_ids)
 artist_latitude.append(artist_lat)
 artist_longitude.append(artist_long)
 artist_location.append(artist_loc)
 song_id.append(song_idss)  
 song_tempo.append(song_speed)
 song_bars.append(song_bar)
 song_beats.append(song_beat)
 song_time_signatures.append(song_time_signature)
 song_tatum.append(song_tat)
def data_to_flat_file(basedir, ext='.h5'):
    """This function extract the information from the tables and creates the flat file."""
    count = 0
    #song counter
    list_to_write = []
    row_to_write = ""
    writer = csv.writer(open("metadata_wholeA.csv", "wb"))
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print f  #the name of the file
            h5 = hdf5_getters.open_h5_file_read(f)
            title = hdf5_getters.get_title(h5)
            title = title.replace('"', '')
            comma = title.find(',')  #eliminating commas in the title
            if comma != -1:
                print title
                time.sleep(1)
            album = hdf5_getters.get_release(h5)
            album = album.replace('"', '')  #eliminating commas in the album
            comma = album.find(',')
            if comma != -1:
                print album
                time.sleep(1)
            artist_name = hdf5_getters.get_artist_name(h5)
            comma = artist_name.find(',')
            if comma != -1:
                print artist_name
                time.sleep(1)
            artist_name = artist_name.replace('"',
                                              '')  #eliminating double quotes
            duration = hdf5_getters.get_duration(h5)
            samp_rt = hdf5_getters.get_analysis_sample_rate(h5)
            artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5)
            artist_fam = hdf5_getters.get_artist_familiarity(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_fam) == True:
                artist_fam = -1
            artist_hotness = hdf5_getters.get_artist_hotttnesss(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_hotness) == True:
                artist_hotness = -1
            artist_id = hdf5_getters.get_artist_id(h5)
            artist_lat = hdf5_getters.get_artist_latitude(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_lat) == True:
                artist_lat = -1
            artist_loc = hdf5_getters.get_artist_location(h5)
            #checks artist_loc to see if it is a hyperlink if it is set as empty string
            artist_loc = artist_loc.replace(",", "\,")
            if artist_loc.startswith("<a"):
                artist_loc = ""
            if len(artist_loc) > 100:
                artist_loc = ""
            artist_lon = hdf5_getters.get_artist_longitude(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(artist_lon) == True:
                artist_lon = -1
            artist_mbid = hdf5_getters.get_artist_mbid(h5)
            artist_pmid = hdf5_getters.get_artist_playmeid(h5)
            audio_md5 = hdf5_getters.get_audio_md5(h5)
            danceability = hdf5_getters.get_danceability(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(danceability) == True:
                danceability = -1
            end_fade_in = hdf5_getters.get_end_of_fade_in(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(end_fade_in) == True:
                end_fade_in = -1
            energy = hdf5_getters.get_energy(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(energy) == True:
                energy = -1
            song_key = hdf5_getters.get_key(h5)
            key_c = hdf5_getters.get_key_confidence(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(key_c) == True:
                key_c = -1
            loudness = hdf5_getters.get_loudness(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(loudness) == True:
                loudness = -1
            mode = hdf5_getters.get_mode(h5)
            mode_conf = hdf5_getters.get_mode_confidence(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(mode_conf) == True:
                mode_conf = -1
            release_7digitalid = hdf5_getters.get_release_7digitalid(h5)
            song_hot = hdf5_getters.get_song_hotttnesss(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(song_hot) == True:
                song_hot = -1
            song_id = hdf5_getters.get_song_id(h5)
            start_fade_out = hdf5_getters.get_start_of_fade_out(h5)
            tempo = hdf5_getters.get_tempo(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(tempo) == True:
                tempo = -1
            time_sig = hdf5_getters.get_time_signature(h5)
            time_sig_c = hdf5_getters.get_time_signature_confidence(h5)
            #checking if we get a "nan" if we do we change it to -1
            if numpy.isnan(time_sig_c) == True:
                time_sig_c = -1
            track_id = hdf5_getters.get_track_id(h5)
            track_7digitalid = hdf5_getters.get_track_7digitalid(h5)
            year = hdf5_getters.get_year(h5)
            bars_c = hdf5_getters.get_bars_confidence(h5)
            bars_c_avg = get_avg(bars_c)
            bars_c_max = get_max(bars_c)
            bars_c_min = get_min(bars_c)
            bars_c_stddev = get_stddev(bars_c)
            bars_c_count = get_count(bars_c)
            bars_c_sum = get_sum(bars_c)
            bars_start = hdf5_getters.get_bars_start(h5)
            bars_start_avg = get_avg(bars_start)
            bars_start_max = get_max(bars_start)
            bars_start_min = get_min(bars_start)
            bars_start_stddev = get_stddev(bars_start)
            bars_start_count = get_count(bars_start)
            bars_start_sum = get_sum(bars_start)
            beats_c = hdf5_getters.get_beats_confidence(h5)
            beats_c_avg = get_avg(beats_c)
            beats_c_max = get_max(beats_c)
            beats_c_min = get_min(beats_c)
            beats_c_stddev = get_stddev(beats_c)
            beats_c_count = get_count(beats_c)
            beats_c_sum = get_sum(beats_c)
            beats_start = hdf5_getters.get_beats_start(h5)
            beats_start_avg = get_avg(beats_start)
            beats_start_max = get_max(beats_start)
            beats_start_min = get_min(beats_start)
            beats_start_stddev = get_stddev(beats_start)
            beats_start_count = get_count(beats_start)
            beats_start_sum = get_sum(beats_start)
            sec_c = hdf5_getters.get_sections_confidence(h5)
            sec_c_avg = get_avg(sec_c)
            sec_c_max = get_max(sec_c)
            sec_c_min = get_min(sec_c)
            sec_c_stddev = get_stddev(sec_c)
            sec_c_count = get_count(sec_c)
            sec_c_sum = get_sum(sec_c)
            sec_start = hdf5_getters.get_sections_start(h5)
            sec_start_avg = get_avg(sec_start)
            sec_start_max = get_max(sec_start)
            sec_start_min = get_min(sec_start)
            sec_start_stddev = get_stddev(sec_start)
            sec_start_count = get_count(sec_start)
            sec_start_sum = get_sum(sec_start)
            seg_c = hdf5_getters.get_segments_confidence(h5)
            seg_c_avg = get_avg(seg_c)
            seg_c_max = get_max(seg_c)
            seg_c_min = get_min(seg_c)
            seg_c_stddev = get_stddev(seg_c)
            seg_c_count = get_count(seg_c)
            seg_c_sum = get_sum(seg_c)
            seg_loud_max = hdf5_getters.get_segments_loudness_max(h5)
            seg_loud_max_avg = get_avg(seg_loud_max)
            seg_loud_max_max = get_max(seg_loud_max)
            seg_loud_max_min = get_min(seg_loud_max)
            seg_loud_max_stddev = get_stddev(seg_loud_max)
            seg_loud_max_count = get_count(seg_loud_max)
            seg_loud_max_sum = get_sum(seg_loud_max)
            seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
            seg_loud_max_time_avg = get_avg(seg_loud_max_time)
            seg_loud_max_time_max = get_max(seg_loud_max_time)
            seg_loud_max_time_min = get_min(seg_loud_max_time)
            seg_loud_max_time_stddev = get_stddev(seg_loud_max_time)
            seg_loud_max_time_count = get_count(seg_loud_max_time)
            seg_loud_max_time_sum = get_sum(seg_loud_max_time)
            seg_loud_start = hdf5_getters.get_segments_loudness_start(h5)
            seg_loud_start_avg = get_avg(seg_loud_start)
            seg_loud_start_max = get_max(seg_loud_start)
            seg_loud_start_min = get_min(seg_loud_start)
            seg_loud_start_stddev = get_stddev(seg_loud_start)
            seg_loud_start_count = get_count(seg_loud_start)
            seg_loud_start_sum = get_sum(seg_loud_start)
            seg_pitch = hdf5_getters.get_segments_pitches(h5)
            pitch_size = len(seg_pitch)
            seg_start = hdf5_getters.get_segments_start(h5)
            seg_start_avg = get_avg(seg_start)
            seg_start_max = get_max(seg_start)
            seg_start_min = get_min(seg_start)
            seg_start_stddev = get_stddev(seg_start)
            seg_start_count = get_count(seg_start)
            seg_start_sum = get_sum(seg_start)
            seg_timbre = hdf5_getters.get_segments_timbre(h5)
            tatms_c = hdf5_getters.get_tatums_confidence(h5)
            tatms_c_avg = get_avg(tatms_c)
            tatms_c_max = get_max(tatms_c)
            tatms_c_min = get_min(tatms_c)
            tatms_c_stddev = get_stddev(tatms_c)
            tatms_c_count = get_count(tatms_c)
            tatms_c_sum = get_sum(tatms_c)
            tatms_start = hdf5_getters.get_tatums_start(h5)
            tatms_start_avg = get_avg(tatms_start)
            tatms_start_max = get_max(tatms_start)
            tatms_start_min = get_min(tatms_start)
            tatms_start_stddev = get_stddev(tatms_start)
            tatms_start_count = get_count(tatms_start)
            tatms_start_sum = get_sum(tatms_start)

            #Getting the genres
            genre_set = 0  #flag to see if the genre has been set or not
            art_trm = hdf5_getters.get_artist_terms(h5)
            trm_freq = hdf5_getters.get_artist_terms_freq(h5)
            trn_wght = hdf5_getters.get_artist_terms_weight(h5)
            a_mb_tags = hdf5_getters.get_artist_mbtags(h5)
            genre_indexes = get_genre_indexes(
                trm_freq)  #index of the highest freq
            final_genre = []
            genres_so_far = []
            for i in range(len(genre_indexes)):
                genre_tmp = get_genre(
                    art_trm, genre_indexes[i]
                )  #genre that corresponds to the highest freq
                genres_so_far = genre_dict.get_genre_in_dict(
                    genre_tmp)  #getting the genre from the dictionary
                if len(genres_so_far) != 0:
                    for i in genres_so_far:
                        final_genre.append(i)
                        genre_set = 1  #genre was found in dictionary

            if genre_set == 1:
                col_num = []

                for genre in final_genre:
                    column = int(
                        genre)  #getting the column number of the genre
                    col_num.append(column)

                genre_array = genre_columns(col_num)  #genre array
            else:
                genre_array = genre_columns(
                    -1)  #the genre was not found in the dictionary

            transpose_pitch = seg_pitch.transpose(
            )  #this is to tranpose the matrix,so we can have 12 rows
            #arrays containing the aggregate values of the 12 rows
            seg_pitch_avg = []
            seg_pitch_max = []
            seg_pitch_min = []
            seg_pitch_stddev = []
            seg_pitch_count = []
            seg_pitch_sum = []
            i = 0
            #Getting the aggregate values in the pitches array
            for row in transpose_pitch:
                seg_pitch_avg.append(get_avg(row))
                seg_pitch_max.append(get_max(row))
                seg_pitch_min.append(get_min(row))
                seg_pitch_stddev.append(get_stddev(row))
                seg_pitch_count.append(get_count(row))
                seg_pitch_sum.append(get_sum(row))
                i = i + 1

            #extracting information from the timbre array
            transpose_timbre = seg_pitch.transpose(
            )  #tranposing matrix, to have 12 rows
            #arrays containing the aggregate values of the 12 rows
            seg_timbre_avg = []
            seg_timbre_max = []
            seg_timbre_min = []
            seg_timbre_stddev = []
            seg_timbre_count = []
            seg_timbre_sum = []
            i = 0
            for row in transpose_timbre:
                seg_timbre_avg.append(get_avg(row))
                seg_timbre_max.append(get_max(row))
                seg_timbre_min.append(get_min(row))
                seg_timbre_stddev.append(get_stddev(row))
                seg_timbre_count.append(get_count(row))
                seg_timbre_sum.append(get_sum(row))
                i = i + 1

        #Writing to the flat file
            writer.writerow([
                title, album, artist_name, year, duration, seg_start_count,
                tempo
            ])

            h5.close()
            count = count + 1
            print count
Esempio n. 32
0
import sys
sys.path.append('/home/gaurav/PythonDir/ProjectWork/MSongsDB-master/PythonSrc')
import hdf5_getters
sumfile = '/home/gaurav/PythonDir/ProjectWork/MillionSongSubset/AdditionalFiles/subset_msd_summary_file.h5'

h5 = hdf5_getters.open_h5_file_read(sumfile)
for k in range(10000):
    #a_name = hdf5_getters.get_artist_name(h5,k)
    hot1 = hdf5_getters.get_song_hotttnesss(h5, k)
    #if a_name == 'Radiohead':
    if hot1:
        print hdf5_getters.get_track_id(h5, k), hot1

#Faster
#h5 = hdf5_getters.open_h5_file_read(sumfile)
#idxs = h5.root.metadata.songs.getWhereList('artist_name=="Radiohead"')
#for idx in idxs:
#    print h5.root.analysis.songs.cols.track_id[idx]
Esempio n. 33
0
            # Fragment: enclosing loop starts outside this view; h aliases
            # hdf5_getters, and cursor/artist_id/artist_name/terms come from
            # earlier code — TODO confirm.
            # NOTE(review): every query below builds SQL by string
            # concatenation — this is SQL-injectable and breaks on values
            # containing quotes; stripping single quotes with replace() is
            # not adequate escaping.  Should use parameterized queries
            # (placeholder style depends on the DB driver).
            mbtags = h.get_artist_mbtags(h5,0)

            # Insert each artist term as a genre row if not already present.
            for term in terms:
                term = term.replace("'","")
                cursor.execute("SELECT * FROM artist_genres WHERE artist_id='" + artist_id + "' AND genre ='" + term + "'")
                if cursor.rowcount != 1:
                    cursor.execute("INSERT INTO artist_genres VALUES ('" + artist_id + "','" + term + "')")
            # Same de-dup insert for the MusicBrainz tags.
            for tag in mbtags:
                tag = tag.replace("'","")
                cursor.execute("SELECT * FROM artist_genres WHERE artist_id='" + artist_id + "' AND genre ='" + tag + "'")
                if cursor.rowcount != 1:
                    cursor.execute("INSERT INTO artist_genres VALUES ('" + artist_id + "','" + tag + "')")

            ''' Store track tuples '''

            track_id = h.get_track_id(h5,0)
            track_title = h.get_title(h5,0)
            track_title = track_title.replace("'","")
            track_album = h.get_release(h5,0)
            track_album = track_album.replace("'","")
            track_duration = str(h.get_duration(h5,0))
            track_year = str(h.get_year(h5,0))

            # Insert the track row only if it does not exist yet.
            # NOTE(review): rowcount semantics after SELECT vary by driver;
            # fetchall() result (rs) is unused — verify this dedup works.
            cursor.execute("SELECT * FROM track WHERE track_id = '" + track_id  + "'")
            rs = cursor.fetchall()
            if cursor.rowcount != 1:
                cursor.execute("INSERT INTO track VALUES ('" + track_id + "','" + track_title + "','" + artist_id  + "','"  + artist_name + "','" + track_album + "'," + track_duration + "," + track_year  + ");")
                      
            ''' Store track_analysis tuples '''
            print ("Track ID: " + h.get_track_id(h5,0))
            track_tempo = str(h.get_tempo(h5,0))
# Fragment: convert the MSD summary HDF5 file to a CSV of selected fields.
# Truncated in this view — the writer is initialized but no writerow call is
# visible; presumably the row is written further down.
sys.path.append("/mnt/MSongsDB/PythonSrc")  # this is where hdf5_getters.py is placed
import hdf5_getters

# This script converts the summary H5 files only 300MB to a csv file
# Run only on the Master Node since h5_getters cannot open a remote(ie. HDFS) file

if __name__ == "__main__":

    with open("fields.csv", "wb") as f:
        writer = csv.writer(f)  # initialize the csv writer

        # for each track in the summary file, get the 11 fields and output to csv
        h5_file = hdf5_getters.open_h5_file_read("msd_summary_file.h5")
        # NOTE(review): the full MSD size (1,000,000) is hard-coded and a
        # line is printed per track — slow; get_num_songs(h5_file) would be
        # safer for other inputs.
        for k in range(1000000):
            print "index!!!: ", k
            # NOTE(review): `id` shadows the builtin of the same name.
            id = hdf5_getters.get_track_id(h5_file, k)  # get track_id TRA13e39..
            title = hdf5_getters.get_title(h5_file, k)  # get song title
            artist_name = hdf5_getters.get_artist_name(h5_file, k)
            year = int(hdf5_getters.get_year(h5_file, k))
            hotness = float(hdf5_getters.get_song_hotttnesss(h5_file, k))
            artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5_file, k))
            f5 = int(hdf5_getters.get_key(h5_file, k))  # get key
            f2 = float(hdf5_getters.get_loudness(h5_file, k))  # get loudness
            f1 = float(hdf5_getters.get_tempo(h5_file, k))  # get tempo
            f4 = int(hdf5_getters.get_duration(h5_file, k))  # get duration
            f3 = float(hdf5_getters.get_time_signature(h5_file, k))  # get time signature

            # Get rid of missing info and change invalid numbers for meta data

            if not artist_name:
                artist_name = "unknown"
Esempio n. 35
0
def process_filelist_test(filelist=None,model=None,tmpfilename=None,
                           npicks=None,winsize=None,finaldim=None,K=1,
                          typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is in testartist)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff' or 'cov'
                      must be the same as in training
    """
    # sanity check
    # Every argument must be supplied explicitly (defaults are just sentinels).
    for arg in locals().values():
        assert not arg is None,'process_filelist_test, missing an argument, something still None'
    # Refuse to clobber an existing output file or run without a model.
    if os.path.isfile(tmpfilename):
        print 'ERROR: file',tmpfilename,'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model',model,'does not exist.'
        return
    # create kdtree
    # NOTE(review): tables.openFile/createGroup/createEArray are the
    # PyTables 2.x camelCase API (renamed in 3.x).
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[1]==finaldim,'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create outputfile
    # Two growable arrays: ground-truth year and predicted year per song.
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/",'data','TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,'year_real',tables.IntAtom(shape=()),(0,),'',
                        expectedrows=len(filelist))
    output.createEArray(group,'year_pred',tables.Float64Atom(shape=()),(0,),'',
                        expectedrows=len(filelist))
    # random projection
    # Projection input size depends on the compression scheme; must match
    # the dimensions used at training time.
    ndim = 12 # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress=='cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False,'Unknown type of compression: '+str(typecompress)
    # go through files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        if cnt_f % 5000 == 0:
            print 'TESTING FILE #'+str(cnt_f)
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        # Songs with unknown year (0) carry no ground truth — skip.
        if year <= 0: # probably useless but...
            continue
        # Build the compressed feature vector according to typecompress.
        if typecompress == 'picks':
            # we have a train artist with a song year, we're good
            bttimbre = get_bttimbre(f)
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre,npicks,winsize,finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,finaldim,randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,finaldim,randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,finaldim,randproj=randproj)
        else:
            assert False,'Unknown type of compression: '+str(typecompress)
        # Feature extraction can fail or yield nothing usable — skip song.
        if processed_feats is None:
            continue
        if processed_feats.shape[0] == 0:
            continue
        # do prediction
        year_pred = do_prediction(processed_feats,kd,h5model,K)
        # add pred and ground truth to output
        if not year_pred is None:
            output.root.data.year_real.append( [year] )
            output.root.data.year_pred.append( [year_pred] )
    # close output and model
    del kd
    h5model.close()
    output.close()
    # done
    return
Esempio n. 36
0
def process_filelist_train(filelist=None,testartists=None,tmpfilename=None,
                           npicks=None,winsize=None,finaldim=None,typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is not in testartists)
    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - one of 'picks' (win of btchroma), 'corrcoeff' (correlation
                      coefficients), 'cov' (covariance), 'avgcov' (average + covariance)
    """
    # sanity check: every argument must have been set by the caller
    for arg in locals().values():
        assert not arg is None,'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file',tmpfilename,'already exists.'
        return
    # create output file: one extendable array each for feats, year, track id
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/",'data','TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,'feats',tables.Float64Atom(shape=()),(0,finaldim),'',
                        expectedrows=len(filelist))
    output.createEArray(group,'year',tables.IntAtom(shape=()),(0,),'',
                        expectedrows=len(filelist))
    output.createEArray(group,'track_id',tables.StringAtom(18,shape=()),(0,),'',
                        expectedrows=len(filelist))
    # random projection matrix, sized for the chosen compression scheme
    ndim = 12 # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False,'Unknown type of compression: '+str(typecompress)
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #',cnt_f
        # check file: we need a positive year and an artist outside the test set
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0 or artist_id in testartists:
            continue
        # we have a train artist with a song year, we're good
        if typecompress == 'picks':
            # beat-aligned timbre is only needed by the 'picks' scheme,
            # so compute it inside the branch (was done unconditionally before)
            bttimbre = get_bttimbre(f)
            if bttimbre is None:
                continue
            processed_feats = CBTF.extract_and_compress(bttimbre,npicks,winsize,finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,finaldim,randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,finaldim,randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,finaldim,randproj=randproj)
        else:
            assert False,'Unknown type of compression: '+str(typecompress)
        # guard against failed/empty extraction (mirrors the test-side code)
        if processed_feats is None or processed_feats.shape[0] == 0:
            continue
        # save them to tmp file
        n_p_feats = processed_feats.shape[0]
        output.root.data.year.append( np.array( [year] * n_p_feats ) )
        output.root.data.track_id.append( np.array( [track_id] * n_p_feats ) )
        output.root.data.feats.append( processed_feats )
    # we're done, close output
    output.close()
    return
Esempio n. 37
0
def data_to_flat_file(basedir,ext='.h5') :
    """This function extract the information from the tables and creates the flat file."""	
    count = 0;	#song counter
    list_to_write= []
    row_to_write = ""
    writer = csv.writer(open("metadata.csv", "wb"))
    for root, dirs, files in os.walk(basedir):
	files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
	    print f	#the name of the file
            h5 = hdf5_getters.open_h5_file_read(f)
	    title = hdf5_getters.get_title(h5) 
	    title= title.replace('"','') 
	    comma=title.find(',')	#eliminating commas in the title
	    if	comma != -1:
		    print title
		    time.sleep(1)
	    album = hdf5_getters.get_release(h5)
	    album= album.replace('"','')	#eliminating commas in the album	
	    comma=album.find(',')
	    if	comma != -1:
		    print album
		    time.sleep(1)
	    artist_name = hdf5_getters.get_artist_name(h5)
	    comma=artist_name.find(',')
	    if	comma != -1:
		    print artist_name
		    time.sleep(1)
	    artist_name= artist_name.replace('"','')	#eliminating double quotes
	    duration = hdf5_getters.get_duration(h5)
	    samp_rt = hdf5_getters.get_analysis_sample_rate(h5)
	    artist_7digitalid = hdf5_getters.get_artist_7digitalid(h5)
	    artist_fam = hdf5_getters.get_artist_familiarity(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_fam) == True:
	            artist_fam=-1
	    artist_hotness= hdf5_getters.get_artist_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_hotness) == True:
	            artist_hotness=-1
	    artist_id = hdf5_getters.get_artist_id(h5)
	    artist_lat = hdf5_getters.get_artist_latitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lat) == True:
	            artist_lat=-1
	    artist_loc = hdf5_getters.get_artist_location(h5)
		#checks artist_loc to see if it is a hyperlink if it is set as empty string
	    artist_loc = artist_loc.replace(",", "\,");
	    if artist_loc.startswith("<a"):
                artist_loc = ""
	    if len(artist_loc) > 100:
                artist_loc = ""
	    artist_lon = hdf5_getters.get_artist_longitude(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(artist_lon) == True:
	            artist_lon=-1
	    artist_mbid = hdf5_getters.get_artist_mbid(h5)
	    artist_pmid = hdf5_getters.get_artist_playmeid(h5)
	    audio_md5 = hdf5_getters.get_audio_md5(h5)
	    danceability = hdf5_getters.get_danceability(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(danceability) == True:
	            danceability=-1
	    end_fade_in =hdf5_getters.get_end_of_fade_in(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(end_fade_in) == True:
	            end_fade_in=-1
	    energy = hdf5_getters.get_energy(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(energy) == True:
	            energy=-1
            song_key = hdf5_getters.get_key(h5)
	    key_c = hdf5_getters.get_key_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(key_c) == True:
	            key_c=-1
	    loudness = hdf5_getters.get_loudness(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(loudness) == True:
	            loudness=-1
	    mode = hdf5_getters.get_mode(h5)
	    mode_conf = hdf5_getters.get_mode_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(mode_conf) == True:
	            mode_conf=-1
	    release_7digitalid = hdf5_getters.get_release_7digitalid(h5)
	    song_hot = hdf5_getters.get_song_hotttnesss(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(song_hot) == True:
	            song_hot=-1
	    song_id = hdf5_getters.get_song_id(h5)
	    start_fade_out = hdf5_getters.get_start_of_fade_out(h5)
	    tempo = hdf5_getters.get_tempo(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(tempo) == True:
	            tempo=-1
	    time_sig = hdf5_getters.get_time_signature(h5)
	    time_sig_c = hdf5_getters.get_time_signature_confidence(h5)
	    #checking if we get a "nan" if we do we change it to -1
	    if numpy.isnan(time_sig_c) == True:
	            time_sig_c=-1
	    track_id = hdf5_getters.get_track_id(h5)
	    track_7digitalid = hdf5_getters.get_track_7digitalid(h5)
	    year = hdf5_getters.get_year(h5)
	    bars_c = hdf5_getters.get_bars_confidence(h5)
	    bars_c_avg= get_avg(bars_c)
	    bars_c_max= get_max(bars_c)
	    bars_c_min = get_min(bars_c)
	    bars_c_stddev= get_stddev(bars_c)
	    bars_c_count = get_count(bars_c)
	    bars_c_sum = get_sum(bars_c)
	    bars_start = hdf5_getters.get_bars_start(h5)
	    bars_start_avg = get_avg(bars_start)
	    bars_start_max= get_max(bars_start)
	    bars_start_min = get_min(bars_start)
	    bars_start_stddev= get_stddev(bars_start)
	    bars_start_count = get_count(bars_start)
	    bars_start_sum = get_sum(bars_start)
            beats_c = hdf5_getters.get_beats_confidence(h5)
            beats_c_avg= get_avg(beats_c)
	    beats_c_max= get_max(beats_c)
	    beats_c_min = get_min(beats_c)
	    beats_c_stddev= get_stddev(beats_c)
	    beats_c_count = get_count(beats_c)
	    beats_c_sum = get_sum(beats_c)
            beats_start = hdf5_getters.get_beats_start(h5)
 	    beats_start_avg = get_avg(beats_start)
	    beats_start_max= get_max(beats_start)
	    beats_start_min = get_min(beats_start)
	    beats_start_stddev= get_stddev(beats_start)
	    beats_start_count = get_count(beats_start)
	    beats_start_sum = get_sum(beats_start)
	    sec_c = hdf5_getters.get_sections_confidence(h5)
            sec_c_avg= get_avg(sec_c)
	    sec_c_max= get_max(sec_c)
	    sec_c_min = get_min(sec_c)
	    sec_c_stddev= get_stddev(sec_c)
	    sec_c_count = get_count(sec_c)
	    sec_c_sum = get_sum(sec_c)
	    sec_start = hdf5_getters.get_sections_start(h5)
            sec_start_avg = get_avg(sec_start)
	    sec_start_max= get_max(sec_start)
	    sec_start_min = get_min(sec_start)
	    sec_start_stddev= get_stddev(sec_start)
	    sec_start_count = get_count(sec_start)
	    sec_start_sum = get_sum(sec_start)
	    seg_c = hdf5_getters.get_segments_confidence(h5)
	    seg_c_avg= get_avg(seg_c)
	    seg_c_max= get_max(seg_c)
	    seg_c_min = get_min(seg_c)
	    seg_c_stddev= get_stddev(seg_c)
	    seg_c_count = get_count(seg_c)
	    seg_c_sum = get_sum(seg_c)
            seg_loud_max = hdf5_getters.get_segments_loudness_max(h5)
            seg_loud_max_avg= get_avg(seg_loud_max)
	    seg_loud_max_max= get_max(seg_loud_max)
	    seg_loud_max_min = get_min(seg_loud_max)
	    seg_loud_max_stddev= get_stddev(seg_loud_max)
	    seg_loud_max_count = get_count(seg_loud_max)
	    seg_loud_max_sum = get_sum(seg_loud_max)
	    seg_loud_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
	    seg_loud_max_time_avg= get_avg(seg_loud_max_time)
	    seg_loud_max_time_max= get_max(seg_loud_max_time)
	    seg_loud_max_time_min = get_min(seg_loud_max_time)
	    seg_loud_max_time_stddev= get_stddev(seg_loud_max_time)
	    seg_loud_max_time_count = get_count(seg_loud_max_time)
	    seg_loud_max_time_sum = get_sum(seg_loud_max_time)
	    seg_loud_start = hdf5_getters.get_segments_loudness_start(h5)
	    seg_loud_start_avg= get_avg(seg_loud_start)
	    seg_loud_start_max= get_max(seg_loud_start)
	    seg_loud_start_min = get_min(seg_loud_start)
	    seg_loud_start_stddev= get_stddev(seg_loud_start)
	    seg_loud_start_count = get_count(seg_loud_start)
	    seg_loud_start_sum = get_sum(seg_loud_start)					      
	    seg_pitch = hdf5_getters.get_segments_pitches(h5)
	    pitch_size = len(seg_pitch)
	    seg_start = hdf5_getters.get_segments_start(h5)
	    seg_start_avg= get_avg(seg_start)
	    seg_start_max= get_max(seg_start)
	    seg_start_min = get_min(seg_start)
	    seg_start_stddev= get_stddev(seg_start)
	    seg_start_count = get_count(seg_start)
	    seg_start_sum = get_sum(seg_start)
	    seg_timbre = hdf5_getters.get_segments_timbre(h5)
	    tatms_c = hdf5_getters.get_tatums_confidence(h5)
	    tatms_c_avg= get_avg(tatms_c)
	    tatms_c_max= get_max(tatms_c)
	    tatms_c_min = get_min(tatms_c)
	    tatms_c_stddev= get_stddev(tatms_c)
	    tatms_c_count = get_count(tatms_c)
	    tatms_c_sum = get_sum(tatms_c)
	    tatms_start = hdf5_getters.get_tatums_start(h5)
	    tatms_start_avg= get_avg(tatms_start)
	    tatms_start_max= get_max(tatms_start)
	    tatms_start_min = get_min(tatms_start)
	    tatms_start_stddev= get_stddev(tatms_start)
	    tatms_start_count = get_count(tatms_start)
	    tatms_start_sum = get_sum(tatms_start)
	
	    #Getting the genres
	    genre_set = 0    #flag to see if the genre has been set or not
	    art_trm = hdf5_getters.get_artist_terms(h5)
	    trm_freq = hdf5_getters.get_artist_terms_freq(h5)
	    trn_wght = hdf5_getters.get_artist_terms_weight(h5)
	    a_mb_tags = hdf5_getters.get_artist_mbtags(h5)
	    genre_indexes=get_genre_indexes(trm_freq) #index of the highest freq
	    final_genre=[]
	    genres_so_far=[]
	    for i in range(len(genre_indexes)):
		    genre_tmp=get_genre(art_trm,genre_indexes[i])   #genre that corresponds to the highest freq
		    genres_so_far=genre_dict.get_genre_in_dict(genre_tmp) #getting the genre from the dictionary
		    if len(genres_so_far) != 0:
			    for i in genres_so_far:
				final_genre.append(i)
				genre_set=1				#genre was found in dictionary
				  
		
	    
	    if genre_set == 1:
		    col_num=[]
		   
		    for genre in final_genre:
			    column=int(genre)				#getting the column number of the genre
			    col_num.append(column)

		    genre_array=genre_columns(col_num)	         #genre array
 	    else:
		    genre_array=genre_columns(-1)		#the genre was not found in the dictionary

	    transpose_pitch= seg_pitch.transpose() #this is to tranpose the matrix,so we can have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_pitch_avg=[]
	    seg_pitch_max=[]
	    seg_pitch_min=[]
            seg_pitch_stddev=[]
            seg_pitch_count=[]
	    seg_pitch_sum=[]
            i=0
	    #Getting the aggregate values in the pitches array
	    for row in transpose_pitch:
		   seg_pitch_avg.append(get_avg(row))
		   seg_pitch_max.append(get_max(row))
	           seg_pitch_min.append(get_min(row))
		   seg_pitch_stddev.append(get_stddev(row))
		   seg_pitch_count.append(get_count(row))
                   seg_pitch_sum.append(get_sum(row))
		   i=i+1

	    #extracting information from the timbre array 
            transpose_timbre = seg_pitch.transpose() #tranposing matrix, to have 12 rows
	    #arrays containing the aggregate values of the 12 rows
	    seg_timbre_avg=[]
	    seg_timbre_max=[]
	    seg_timbre_min=[]
            seg_timbre_stddev=[]
            seg_timbre_count=[]
	    seg_timbre_sum=[]
            i=0
	    for row in transpose_timbre:
		   seg_timbre_avg.append(get_avg(row))
		   seg_timbre_max.append(get_max(row))
	           seg_timbre_min.append(get_min(row))
		   seg_timbre_stddev.append(get_stddev(row))
		   seg_timbre_count.append(get_count(row))
                   seg_timbre_sum.append(get_sum(row))
		   i=i+1
		


		#Writing to the flat file

            writer.writerow([title,album,artist_name,duration,samp_rt,artist_7digitalid,artist_fam,artist_hotness,artist_id,artist_lat,artist_loc,artist_lon,artist_mbid,genre_array[0],genre_array[1],genre_array[2],
genre_array[3],genre_array[4],genre_array[5],genre_array[6],genre_array[7],genre_array[8],genre_array[9],genre_array[10],genre_array[11],genre_array[12],genre_array[13],genre_array[14],genre_array[15],
genre_array[16],genre_array[17],genre_array[18],genre_array[19],genre_array[20],genre_array[21],genre_array[22],genre_array[23],genre_array[24],genre_array[25],genre_array[26],
genre_array[27],genre_array[28],genre_array[29],genre_array[30],genre_array[31],genre_array[32],genre_array[33],genre_array[34],genre_array[35],genre_array[36],genre_array[37],genre_array[38],
genre_array[39],genre_array[40],genre_array[41],genre_array[42],genre_array[43],genre_array[44],genre_array[45],genre_array[46],genre_array[47],genre_array[48],genre_array[49],
genre_array[50],genre_array[51],genre_array[52],genre_array[53],genre_array[54],genre_array[55],genre_array[56],genre_array[57],genre_array[58],genre_array[59],
genre_array[60],genre_array[61],genre_array[62],genre_array[63],genre_array[64],genre_array[65],genre_array[66],genre_array[67],genre_array[68],genre_array[69],
genre_array[70],genre_array[71],genre_array[72],genre_array[73],genre_array[74],genre_array[75],genre_array[76],genre_array[77],genre_array[78],genre_array[79],
genre_array[80],genre_array[81],genre_array[82],genre_array[83],genre_array[84],genre_array[85],genre_array[86],genre_array[87],genre_array[88],genre_array[89],
genre_array[90],genre_array[91],genre_array[92],genre_array[93],genre_array[94],genre_array[95],genre_array[96],genre_array[97],genre_array[98],genre_array[99],genre_array[100],genre_array[101],
genre_array[102],genre_array[103],genre_array[104],genre_array[105],genre_array[106],genre_array[107],genre_array[108],genre_array[109],genre_array[110],genre_array[111],genre_array[112],
genre_array[113],genre_array[114],genre_array[115],genre_array[116],genre_array[117],genre_array[118],genre_array[119],genre_array[120],genre_array[121],genre_array[122],genre_array[123],
genre_array[124],genre_array[125],genre_array[126],genre_array[127],genre_array[128],genre_array[129],genre_array[130],genre_array[131],genre_array[132],
artist_pmid,audio_md5,danceability,end_fade_in,energy,song_key,key_c,loudness,mode,mode_conf,release_7digitalid,song_hot,song_id,start_fade_out,tempo,time_sig,time_sig_c,track_id,track_7digitalid,year,bars_c_avg,bars_c_max,bars_c_min,bars_c_stddev,bars_c_count,bars_c_sum,bars_start_avg,bars_start_max,bars_start_min,bars_start_stddev,bars_start_count,bars_start_sum,beats_c_avg,beats_c_max,beats_c_min,beats_c_stddev,beats_c_count,beats_c_sum,beats_start_avg,beats_start_max,beats_start_min, beats_start_stddev,beats_start_count,beats_start_sum, sec_c_avg,sec_c_max,sec_c_min,sec_c_stddev,sec_c_count,sec_c_sum,sec_start_avg,sec_start_max,sec_start_min,sec_start_stddev,sec_start_count,sec_start_sum,seg_c_avg,seg_c_max,seg_c_min,seg_c_stddev,seg_c_count,seg_c_sum,seg_loud_max_avg,seg_loud_max_max,seg_loud_max_min,seg_loud_max_stddev,seg_loud_max_count,seg_loud_max_sum,seg_loud_max_time_avg,seg_loud_max_time_max,seg_loud_max_time_min,seg_loud_max_time_stddev,seg_loud_max_time_count,seg_loud_max_time_sum,seg_loud_start_avg,seg_loud_start_max,seg_loud_start_min,seg_loud_start_stddev,seg_loud_start_count,seg_loud_start_sum,seg_pitch_avg[0],seg_pitch_max[0],seg_pitch_min[0],seg_pitch_stddev[0],seg_pitch_count[0],seg_pitch_sum[0],seg_pitch_avg[1],seg_pitch_max[1],seg_pitch_min[1],seg_pitch_stddev[1],seg_pitch_count[1],seg_pitch_sum[1],seg_pitch_avg[2],seg_pitch_max[2],seg_pitch_min[2],seg_pitch_stddev[2],seg_pitch_count[2],seg_pitch_sum[2],seg_pitch_avg[3],seg_pitch_max[3],seg_pitch_min[3],seg_pitch_stddev[3],seg_pitch_count[3],seg_pitch_sum[3],seg_pitch_avg[4],seg_pitch_max[4],seg_pitch_min[4],seg_pitch_stddev[4],seg_pitch_count[4],seg_pitch_sum[4],seg_pitch_avg[5],seg_pitch_max[5],seg_pitch_min[5],seg_pitch_stddev[5],seg_pitch_count[5],seg_pitch_sum[5],seg_pitch_avg[6],seg_pitch_max[6],seg_pitch_min[6],seg_pitch_stddev[6],seg_pitch_count[6],seg_pitch_sum[6],seg_pitch_avg[7],seg_pitch_max[7],seg_pitch_min[7],seg_pitch_stddev[7],seg_pitch_count[7],seg_pitch_sum[7],seg_pitch_avg[8
],seg_pitch_max[8],seg_pitch_min[8],seg_pitch_stddev[8],seg_pitch_count[8],seg_pitch_sum[8],seg_pitch_avg[9],seg_pitch_max[9],seg_pitch_min[9],seg_pitch_stddev[9],seg_pitch_count[9],seg_pitch_sum[9],seg_pitch_avg[10],seg_pitch_max[10],seg_pitch_min[10],seg_pitch_stddev[10],seg_pitch_count[10],seg_pitch_sum[10],seg_pitch_avg[11],seg_pitch_max[11],seg_pitch_min[11],
seg_pitch_stddev[11],seg_pitch_count[11],seg_pitch_sum[11],seg_start_avg,seg_start_max,seg_start_min,seg_start_stddev, 
seg_start_count,seg_start_sum,seg_timbre_avg[0],seg_timbre_max[0],seg_timbre_min[0],seg_timbre_stddev[0],seg_timbre_count[0],
seg_timbre_sum[0],seg_timbre_avg[1],seg_timbre_max[1],seg_timbre_min[1],seg_timbre_stddev[1],seg_timbre_count[1],
seg_timbre_sum[1],seg_timbre_avg[2],seg_timbre_max[2],seg_timbre_min[2],seg_timbre_stddev[2],seg_timbre_count[2],
seg_timbre_sum[2],seg_timbre_avg[3],seg_timbre_max[3],seg_timbre_min[3],seg_timbre_stddev[3],seg_timbre_count[3],
seg_timbre_sum[3],seg_timbre_avg[4],seg_timbre_max[4],seg_timbre_min[4],seg_timbre_stddev[4],seg_timbre_count[4],
seg_timbre_sum[4],seg_timbre_avg[5],seg_timbre_max[5],seg_timbre_min[5],seg_timbre_stddev[5],seg_timbre_count[5],
seg_timbre_sum[5],seg_timbre_avg[6],seg_timbre_max[6],seg_timbre_min[6],seg_timbre_stddev[6],seg_timbre_count[6],
seg_timbre_sum[6],seg_timbre_avg[7],seg_timbre_max[7],seg_timbre_min[7],seg_timbre_stddev[7],seg_timbre_count[7],
seg_timbre_sum[7],seg_timbre_avg[8],seg_timbre_max[8],seg_timbre_min[8],seg_timbre_stddev[8],seg_timbre_count[8],
seg_timbre_sum[8],seg_timbre_avg[9],seg_timbre_max[9],seg_timbre_min[9],seg_timbre_stddev[9],seg_timbre_count[9],
seg_timbre_sum[9],seg_timbre_avg[10],seg_timbre_max[10],seg_timbre_min[10],seg_timbre_stddev[10],seg_timbre_count[10],
seg_timbre_sum[10],seg_timbre_avg[11],seg_timbre_max[11],seg_timbre_min[11],seg_timbre_stddev[11],seg_timbre_count[11],
seg_timbre_sum[11],tatms_c_avg,tatms_c_max,tatms_c_min,tatms_c_stddev,tatms_c_count,tatms_c_sum,tatms_start_avg,tatms_start_max,tatms_start_min,tatms_start_stddev,tatms_start_count,tatms_start_sum])






	    h5.close()
	    count=count+1;
	    print count;
Esempio n. 38
0
def process_filelist_test(filelist=None,model=None,tmpfilename=None,K=1):
    """
    Main function, process all files in the list (as long as their track_id
    is not in testsongs)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and artist_id for all train songs
       tmpfilename  - where to save our processed features
       K            - K-nn parameter (default=1)
    """
    # sanity check: every argument must have been set by the caller
    for arg in locals().values():
        assert not arg is None,'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file',tmpfilename,'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model',model,'does not exist.'
        return
    # fixed final dimension of the compressed timbre features
    finaldim = 90
    # create kdtree over the training features
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[1]==finaldim,'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create output file: real and predicted artist id per song
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/",'data','TMP FILE FOR ARTIST RECOGNITION')
    output.createEArray(group,'artist_id_real',tables.StringAtom(18,shape=()),(0,),'',
                        expectedrows=len(filelist))
    output.createEArray(group,'artist_id_pred',tables.StringAtom(18,shape=()),(0,),'',
                        expectedrows=len(filelist))
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'testing... checking file #',cnt_f
        # check what file/song is this
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        track_id = GETTERS.get_track_id(h5)
        # NOTE(review): 'testsongs' is a module-level set defined elsewhere;
        # confirm that skipping its members is intended in the *test* pass.
        if track_id in testsongs: # just in case, but should not be necessary
            print 'Found test track_id during training? weird.',track_id
            h5.close()
            continue
        # extract features, then close file
        processed_feats = compute_features(h5)
        h5.close()
        if processed_feats is None:
            continue
        # do prediction (K-nn over the kdtree of train features)
        artist_id_pred = do_prediction(processed_feats,kd,h5model,K)
        # save real and predicted artist id
        output.root.data.artist_id_real.append( np.array( [artist_id] ) )
        output.root.data.artist_id_pred.append( np.array( [artist_id_pred] ) )
    # we're done, release the kdtree and close model and output
    del kd
    h5model.close()
    output.close()
    return
Esempio n. 39
0
def write_line(output, getter, data, song_nb):
    """Write one tab-separated record to 'output': the track id, then the
    JSON-encoded value returned by the named hdf5_getters accessor for
    song number 'song_nb'."""
    track = hdf5_getters.get_track_id(data).decode('UTF-8')
    value = getattr(hdf5_getters, getter)(data, song_nb)
    output.write(track + '\t' + toJson(value) + '\n')
Esempio n. 40
0
            #segments_loudness_start = ','.join(str(e) for e in GETTERS.get_segments_loudness_start(h5, i)) # array
            #segments_pitches = ','.join(str(e) for e in GETTERS.get_segments_pitches(h5, i)) # array
            #segments_start = ','.join(str(e) for e in GETTERS.get_segments_start(h5, i)) # array
            #segments_timbre = ','.join(str(e) for e in GETTERS.get_segments_timbre(h5, i)) # array
            similar_artists = ','.join(str(e) for e in GETTERS.get_similar_artists(h5, i)) # array
            song_hotttnesss = GETTERS.get_song_hotttnesss(h5, i)
            song_id = GETTERS.get_song_id(h5, i)
            start_of_fade_out = GETTERS.get_start_of_fade_out(h5, i)
            #tatums_confidence = ','.join(str(e) for e in GETTERS.get_tatums_confidence(h5, i)) # array
            #tatums_start = ','.join(str(e) for e in GETTERS.get_tatums_start(h5, i)) # array
            tempo = GETTERS.get_tempo(h5, i)
            time_signature = GETTERS.get_time_signature(h5, i)
            time_signature_confidence = GETTERS.get_time_signature_confidence(h5, i)
            title = GETTERS.get_title(h5, i)
            track_7digitalid = GETTERS.get_track_7digitalid(h5, i)
            track_id = GETTERS.get_track_id(h5, i)
            year = GETTERS.get_year(h5, i)
            loops += 1




            #row = {'analysis_sample_rate':analysis_sample_rate,'artist_7digitalid':artist_7digitalid,'artist_familiarity':artist_familiarity,'artist_hotttnesss':artist_hotttnesss,'artist_id':artist_id,'artist_latitude':artist_latitude,'artist_location':artist_location,'artist_longitude':artist_longitude,'artist_mbid':artist_mbid,'artist_mbtags_count':artist_mbtags_count,'artist_mbtags':artist_mbtags,'artist_name':artist_name,'artist_terms_freq':artist_terms_freq,'artist_terms_weight':artist_terms_weight,'artist_terms':artist_terms,'audio_md5':audio_md5,'bars_confidence':bars_confidence,'bars_start':bars_start,'beats_confidence':beats_confidence,'beats_start':beats_start,'danceability':danceability,'duration':duration,'end_of_fade_in':end_of_fade_in,'energy':energy,'key_confidence':key_confidence,'key':key,'loudness':loudness,'mode_confidence':mode_confidence,'mode':mode,'release_7digitalid':release_7digitalid,'release':release,'sections_confidence':sections_confidence,'sections_start':sections_start,'segments_confidence':segments_confidence,'segments_loudness_max_time':segments_loudness_max_time,'segments_loudness_max':segments_loudness_max,'segments_loudness_start':segments_loudness_start,'segments_pitches':segments_pitches,'segments_start':segments_start,'segments_timbre':segments_timbre,'similar_artists':similar_artists,'song_hotttnesss':song_hotttnesss,'song_id':song_id,'start_of_fade_out':start_of_fade_out,'tatums_confidence':tatums_confidence,'tatums_start':tatums_start,'tempo':tempo,'time_signature_confidence':time_signature_confidence,'time_signature':time_signature,'title':title,'track_7digitalid':track_7digitalid,'track_id':track_id,'year':year,}
            row = {'analysis_sample_rate':analysis_sample_rate,'artist_7digitalid':artist_7digitalid,'artist_familiarity':artist_familiarity,'artist_hotttnesss':artist_hotttnesss,'artist_id':artist_id,'artist_latitude':artist_latitude,'artist_location':artist_location,'artist_longitude':artist_longitude,'artist_mbid':artist_mbid,'artist_mbtags_count':artist_mbtags_count,'artist_mbtags':artist_mbtags,'artist_name':artist_name,'artist_terms':artist_terms,'danceability':danceability,'duration':duration,'end_of_fade_in':end_of_fade_in,'energy':energy,'key_confidence':key_confidence,'key':key,'loudness':loudness,'mode_confidence':mode_confidence,'mode':mode,'release_7digitalid':release_7digitalid,'release':release,'similar_artists':similar_artists,'song_hotttnesss':song_hotttnesss,'song_id':song_id,'start_of_fade_out':start_of_fade_out,'tempo':tempo,'time_signature_confidence':time_signature_confidence,'time_signature':time_signature,'title':title,'track_7digitalid':track_7digitalid,'track_id':track_id,'year':year,
            }
            key = (row['track_id'])

            #for k, v in row.items(): print k, type(v)
            cf.insert(key,row)
            
            #print "Artist: %s\nSong: %s\nDuration: %s\nYear: %s\nHotness: %s\nLoudness: %s\nDanceability:%s\n\n" % (artist_name, title, duration, year, song_hotttnesss, loudness, danceability)
def _nan_to_neg1(value):
    """Return *value*, or -1 when it is NaN (the flat file's missing-data marker)."""
    if numpy.isnan(value):
        return -1
    return value


def _agg_stats(values):
    """Aggregate a 1-D series into [avg, max, min, stddev, count, sum].

    Uses the module-level aggregate helpers (get_avg & co., defined
    elsewhere in this module) so the numbers match the rest of the
    extraction pipeline.
    """
    return [get_avg(values), get_max(values), get_min(values),
            get_stddev(values), get_count(values), get_sum(values)]


def _genre_columns_for(h5):
    """Map the song's artist terms to the genre indicator columns.

    Every artist term selected by get_genre_indexes (highest term
    frequencies) is looked up in the genre dictionary; matching column
    numbers are turned into an indicator array via genre_columns().
    When nothing matches, genre_columns(-1) yields the all-zero array.
    (get_genre_indexes/get_genre/genre_dict/genre_columns are defined
    elsewhere in this module.)
    """
    terms = hdf5_getters.get_artist_terms(h5)
    freqs = hdf5_getters.get_artist_terms_freq(h5)
    matched_cols = []
    for term_idx in get_genre_indexes(freqs):
        for genre_id in genre_dict.get_genre_in_dict(get_genre(terms, term_idx)):
            matched_cols.append(int(genre_id))
    if matched_cols:
        return genre_columns(matched_cols)
    return genre_columns(-1)


def _song_base_row(h5):
    """Build the per-song metadata prefix shared by every output row.

    Double quotes are stripped from the three name fields; numeric fields
    that may be NaN are normalized to -1.  Column order is part of the
    flat-file format and must not change.
    """
    row = []
    row.append(hdf5_getters.get_title(h5).replace('"', ''))
    row.append(hdf5_getters.get_release(h5).replace('"', ''))
    row.append(hdf5_getters.get_artist_name(h5).replace('"', ''))
    row.append(hdf5_getters.get_duration(h5))
    row.append(hdf5_getters.get_analysis_sample_rate(h5))
    row.append(hdf5_getters.get_artist_7digitalid(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_artist_familiarity(h5)))
    row.append(_nan_to_neg1(hdf5_getters.get_artist_hotttnesss(h5)))
    row.append(hdf5_getters.get_artist_id(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_artist_latitude(h5)))
    row.append(hdf5_getters.get_artist_location(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_artist_longitude(h5)))
    row.append(hdf5_getters.get_artist_mbid(h5))
    row.extend(_genre_columns_for(h5))
    row.append(hdf5_getters.get_artist_playmeid(h5))
    row.append(hdf5_getters.get_audio_md5(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_danceability(h5)))
    row.append(_nan_to_neg1(hdf5_getters.get_end_of_fade_in(h5)))
    row.append(_nan_to_neg1(hdf5_getters.get_energy(h5)))
    row.append(hdf5_getters.get_key(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_key_confidence(h5)))
    row.append(_nan_to_neg1(hdf5_getters.get_loudness(h5)))
    row.append(hdf5_getters.get_mode(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_mode_confidence(h5)))
    row.append(hdf5_getters.get_release_7digitalid(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_song_hotttnesss(h5)))
    row.append(hdf5_getters.get_song_id(h5))
    row.append(hdf5_getters.get_start_of_fade_out(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_tempo(h5)))
    row.append(hdf5_getters.get_time_signature(h5))
    row.append(_nan_to_neg1(hdf5_getters.get_time_signature_confidence(h5)))
    row.append(hdf5_getters.get_track_id(h5))
    row.append(hdf5_getters.get_track_7digitalid(h5))
    row.append(hdf5_getters.get_year(h5))
    return row


def _write_series_rows(writer, base_row, group_index, arrays, front_pad, end_pad):
    """Write one CSV row per event of one group (bars, beats, ...).

    *arrays* is a list of equally long 1-D series; each contributes 7
    columns per row: the event's value followed by the series' aggregate
    stats.  *front_pad*/*end_pad* are the blank fillers (from padding(),
    defined elsewhere in this module) that keep this group's columns
    aligned inside the fixed-width row.
    """
    n_events = len(arrays[0])
    if n_events == 0:
        return  # no events of this type: emit no rows
    # Aggregates are loop-invariant: compute them once per song, not once
    # per event row as the previous revision did.
    stats = [_agg_stats(series) for series in arrays]
    for i in range(n_events):
        row = list(base_row)
        row.append(group_index)
        row.append(i)
        row.extend(front_pad)
        for series, agg in zip(arrays, stats):
            row.append(series[i])
            row.extend(agg)
        row.extend(end_pad)
        writer.writerow(row)


def _write_song_rows(writer, h5):
    """Write every per-event row for one open HDF5 song file.

    The front/end padding sizes keep all six groups aligned to the same
    overall row width (each series takes 7 + 7 = 14 stat columns; front
    padding skips the columns of the preceding groups).
    """
    base = _song_base_row(h5)
    # group 0: bars
    _write_series_rows(writer, base, 0,
                       [hdf5_getters.get_bars_confidence(h5),
                        hdf5_getters.get_bars_start(h5)],
                       [], padding(245))
    # group 1: beats
    _write_series_rows(writer, base, 1,
                       [hdf5_getters.get_beats_confidence(h5),
                        hdf5_getters.get_beats_start(h5)],
                       padding(14), padding(231))
    # group 2: sections
    _write_series_rows(writer, base, 2,
                       [hdf5_getters.get_sections_confidence(h5),
                        hdf5_getters.get_sections_start(h5)],
                       padding(28), padding(217))
    # group 3: segments (confidence + loudness + start)
    _write_series_rows(writer, base, 3,
                       [hdf5_getters.get_segments_confidence(h5),
                        hdf5_getters.get_segments_loudness_max(h5),
                        hdf5_getters.get_segments_loudness_max_time(h5),
                        hdf5_getters.get_segments_loudness_start(h5),
                        hdf5_getters.get_segments_start(h5)],
                       padding(42), padding(182))
    # group 4: the 12 pitch rows then the 12 timbre rows, one series each.
    # Transposing the (num_segments x 12) matrices yields one row per
    # chroma/timbre dimension.
    pitch_rows = list(hdf5_getters.get_segments_pitches(h5).transpose())
    # BUGFIX: timbre stats were previously computed from the *pitch*
    # matrix; use the timbre matrix.
    timbre_rows = list(hdf5_getters.get_segments_timbre(h5).transpose())
    _write_series_rows(writer, base, 4, pitch_rows + timbre_rows,
                       padding(77), padding(14))
    # group 5: tatums
    _write_series_rows(writer, base, 5,
                       [hdf5_getters.get_tatums_confidence(h5),
                        hdf5_getters.get_tatums_start(h5)],
                       padding(245), [])


def data_to_flat_file(basedir, ext='.h5'):
    """Extract the information from every *ext* song file under *basedir*
    and write it as one flat CSV file, "complete.csv", in the current
    directory.

    Each song contributes one output row per musical event.  A row is:

        base song metadata, group_index, event index, padded group columns

    group_index identifies the event type: 0=bars, 1=beats, 2=sections,
    3=segments, 4=segment pitches+timbre, 5=tatums.

    Fixes over the previous revision:
    * timbre statistics now come from the timbre matrix (they were
      accidentally computed from the pitch matrix);
    * group_index is passed explicitly per group, so bars rows of every
      song get index 0 (it used to leak the previous song's final value 5);
    * loop-invariant aggregates and HDF5 getters are evaluated once per
      song instead of once per event row;
    * the debugging print/sleep on comma-containing names and the dead
      post-tatum aggregation code were removed;
    * the CSV file and each HDF5 handle are closed even on error.
    """
    out_file = open("complete.csv", "wb")  # binary mode: Python 2 csv convention
    writer = csv.writer(out_file)
    count = 0  # songs processed so far
    try:
        for root, dirs, files in os.walk(basedir):
            for path in glob.glob(os.path.join(root, '*' + ext)):
                print(path)
                h5 = hdf5_getters.open_h5_file_read(path)
                try:
                    _write_song_rows(writer, h5)
                finally:
                    h5.close()  # always release the HDF5 handle
                count += 1
                print(count)
    finally:
        out_file.close()
Esempio n. 42
0
def process_filelist_test(filelist=None,
                          model=None,
                          tmpfilename=None,
                          npicks=None,
                          winsize=None,
                          finaldim=None,
                          K=1,
                          typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is in testartist)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff' or 'cov'
                      must be the same as in training
    """
    # sanity check: all keyword arguments must have been supplied by the caller
    for arg in locals().values():
        assert not arg is None, 'process_filelist_test, missing an argument, something still None'
    # refuse to clobber an existing output file
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model', model, 'does not exist.'
        return
    # create kdtree over the training features for nearest-neighbour lookup
    # NOTE: tables.openFile/createGroup/createEArray are the PyTables 2.x
    # spellings (renamed open_file/create_group/create_earray in PyTables 3).
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[
        1] == finaldim, 'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create output file holding two growable arrays: ground-truth year
    # and predicted year, appended in lockstep one entry per test song
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group,
                        'year_real',
                        tables.IntAtom(shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    output.createEArray(group,
                        'year_pred',
                        tables.Float64Atom(shape=()), (0, ),
                        '',
                        expectedrows=len(filelist))
    # random projection matrix used to compress features down to finaldim;
    # its input dimension depends on the feature type chosen at training time
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # go through files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        if cnt_f % 5000 == 0:
            print 'TESTING FILE #' + str(cnt_f)
        # check file: read song metadata, then close before feature extraction
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)  # NOTE(review): read but never used below
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)  # NOTE(review): read but never used below
        h5.close()
        if year <= 0:  # probably useless but...
            continue
        if typecompress == 'picks':
            # we have a train artist with a song year, we're good
            bttimbre = get_bttimbre(f)
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre,
                                                        npicks,
                                                        winsize,
                                                        finaldim,
                                                        randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres,
                                                     finaldim,
                                                     randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres,
                                                    finaldim,
                                                    randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres,
                                                       finaldim,
                                                       randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        # skip songs whose feature extraction produced nothing usable
        if processed_feats is None:
            continue
        if processed_feats.shape[0] == 0:
            continue
        # do prediction via KNN against the training kdtree
        year_pred = do_prediction(processed_feats, kd, h5model, K)
        # add pred and ground truth to output
        if not year_pred is None:
            output.root.data.year_real.append([year])
            output.root.data.year_pred.append([year_pred])
    # close output and model
    del kd
    h5model.close()
    output.close()
    # done
    return
Esempio n. 43
0
def write_line(output, getter,data,song_nb):
    """Write one tab-separated line to ``output``: the track id followed by
    the JSON-encoded value returned by the named ``hdf5_getters`` getter for
    song ``song_nb``."""
    line = hdf5_getters.get_track_id(data).decode('UTF-8')
    # look the getter up by name so callers can pass any 'get_*' function name
    h5getter = getattr(hdf5_getters, getter)
    line += '\t' + toJson(h5getter(data,song_nb))
    line += '\n'
    output.write(line)
    # NOTE(review): everything below looks like an unrelated script fragment
    # pasted into this function body.  `process_completion` is read before it
    # is ever assigned in this scope (UnboundLocalError if reached), and the
    # fragment is cut off mid-loop.  Confirm against the original source and
    # move/remove this code.
    counter = 0
    file_io_counter = 0
    for root, dirs, files in os.walk(os.getcwd()):
        for name in files:
            if name.endswith(".h5"):
                if process_completion % 10000 == 0:
                    print "done :", process_completion/10000.0, "%"
                    process_completion += 1
                else:
                    process_completion += 1

                tempPath = os.path.abspath(os.path.join(root,name))
                h5file = hdf5_getters.open_h5_file_read(tempPath)

                #data extract 1
                track_id_str = hdf5_getters.get_track_id(h5file)
                song_id_str = hdf5_getters.get_song_id(h5file)
                album_name_str = unicodedata.normalize('NFKD', unicode(hdf5_getters.get_release(h5file),encoding='ASCII',errors='ignore'))
                album_name_str = str(album_name_str).replace(","," ").replace("'","").replace("-"," ").replace("("," ").replace(")"," ").replace("/"," ").replace("\\"," ")
                year_str = str(hdf5_getters.get_year(h5file))

                counter += 1
                file_io_counter += 1
                # flush accumulated rows to disk every 100 files
                if file_io_counter % 100 == 0:
                    if counter == 1:
                        my_array_data_extract_1 = numpy.array([track_id_str, song_id_str, album_name_str, year_str])
                    else :
                        my_array_data_extract_1 = numpy.vstack((my_array_data_extract_1,numpy.array([track_id_str, song_id_str, album_name_str, year_str])))

                    # Python 2 builtin file(); appends to the same output each flush
                    f_handle = file('msd_data_extract_1.bin','a')
                    numpy.savetxt(f_handle, my_array_data_extract_1, delimiter='|',fmt='%s')
Esempio n. 45
0
# This script converts the summary H5 files only 300MB to a csv file
# Run only on the Master Node since h5_getters cannot open a remote(ie. HDFS) file



if __name__ == "__main__":

    with open("fields.csv", "wb") as f:
        writer = csv.writer(f) #initialize the csv writer

        # for each track in the summary file, get the 11 fields and output to csv
        h5_file = hdf5_getters.open_h5_file_read('msd_summary_file.h5')
        # 1,000,000 iterations -- presumably one per track of the full
        # Million Song Dataset summary file; TODO confirm against the file
        for k in range(1000000):
            print "index!!!: ", k
            id = hdf5_getters.get_track_id(h5_file,k) #get track_id TRA13e39..  (NOTE: shadows builtin `id`)
            title = hdf5_getters.get_title(h5_file,k) # get song title
            artist_name = hdf5_getters.get_artist_name(h5_file,k)
            year = int(hdf5_getters.get_year(h5_file,k))
            hotness= float(hdf5_getters.get_song_hotttnesss(h5_file,k))
            artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5_file,k))
            f5 = int(hdf5_getters.get_key(h5_file,k)) #get key
            f2 = float(hdf5_getters.get_loudness(h5_file,k)) #get loudness
            f1 = float(hdf5_getters.get_tempo(h5_file,k)) #get tempo
            f4 = int(hdf5_getters.get_duration(h5_file,k)) #get duration
            f3 = float(hdf5_getters.get_time_signature(h5_file,k)) #get time signature


            # Get rid of missing info and change invalid numbers for meta data
            # NOTE(review): the body of this `if` is missing -- the source
            # appears truncated here, so the cleanup/output logic never made
            # it into this copy.  Recover it from the original script.
            if not artist_name:
def main():
    outputFile1 = open('SongCSV.csv', 'w')
    csvRowString = ""

    #################################################
    #if you want to prompt the user for the order of attributes in the csv,
    #leave the prompt boolean set to True
    #else, set 'prompt' to False and set the order of attributes in the 'else'
    #clause
    prompt = False
    #################################################
    if prompt == True:
        while prompt:

            prompt = False

            csvAttributeString = raw_input("\n\nIn what order would you like the colums of the CSV file?\n" +
                "Please delineate with commas. The options are: " +
                "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"+
                " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo," +
                " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n" +
                "For example, you may write \"Title, Tempo, Duration\"...\n\n" +
                "...or exit by typing 'exit'.\n\n")

            csvAttributeList = re.split('\W+', csvAttributeString)
            for i, v in enumerate(csvAttributeList):
                csvAttributeList[i] = csvAttributeList[i].lower()

            for attribute in csvAttributeList:
                # print "Here is the attribute: " + attribute + " \n"


                if attribute == 'AlbumID'.lower():
                    csvRowString += 'AlbumID'
                elif attribute == 'AlbumName'.lower():
                    csvRowString += 'AlbumName'
                elif attribute == 'ArtistID'.lower():
                    csvRowString += 'ArtistID'
                elif attribute == 'ArtistLatitude'.lower():
                    csvRowString += 'ArtistLatitude'
                elif attribute == 'ArtistLocation'.lower():
                    csvRowString += 'ArtistLocation'
                elif attribute == 'ArtistLongitude'.lower():
                    csvRowString += 'ArtistLongitude'
                elif attribute == 'ArtistName'.lower():
                    csvRowString += 'ArtistName'
                elif attribute == 'Danceability'.lower():
                    csvRowString += 'Danceability'
                elif attribute == 'Duration'.lower():
                    csvRowString += 'Duration'
                elif attribute == 'KeySignature'.lower():
                    csvRowString += 'KeySignature'
                elif attribute == 'KeySignatureConfidence'.lower():
                    csvRowString += 'KeySignatureConfidence'
                elif attribute == 'SongID'.lower():
                    csvRowString += "SongID"
                elif attribute == 'Tempo'.lower():
                    csvRowString += 'Tempo'
                elif attribute == 'TimeSignature'.lower():
                    csvRowString += 'TimeSignature'
                elif attribute == 'TimeSignatureConfidence'.lower():
                    csvRowString += 'TimeSignatureConfidence'
                elif attribute == 'Title'.lower():
                    csvRowString += 'Title'
                elif attribute == 'Year'.lower():
                    csvRowString += 'Year'
                elif attribute == 'track_id'.lower():
                    csvRowString += 'track_id' 
                elif attribute == 'artist_familiarity'.lower():
                    csvRowString += 'artist_familiarity' 
                elif attribute == 'artist_hotttnesss'.lower():
                    csvRowString += 'artist_hotttnesss' 
                elif attribute == 'artist_mbid'.lower():
                    csvRowString += 'artist_mbid'
                elif attribute == 'artist_playmeid'.lower():
                    csvRowString += 'artist_playmeid'
                elif attribute == 'artist_7digitalid'.lower():
                    csvRowString += 'artist_7digitalid'
                elif attribute == 'release'.lower():
                    csvRowString += 'release' 
                elif attribute == 'release_7digitalid'.lower():
                    csvRowString += 'release_7digitalid' 
                elif attribute == 'song_hotttnesss'.lower():
                    csvRowString += 'song_hotttnesss'
                elif attribute == 'track_7digitalid'.lower():
                    csvRowString += 'track_7digitalid' 
                elif attribute == 'analysis_sample_rate'.lower():
                    csvRowString += 'analysis_sample_rate' 
                elif attribute == 'audio_md5'.lower():
                    csvRowString += 'audio_md5' 
                elif attribute == 'end_of_fade_in'.lower():
                    csvRowString += 'end_of_fade_in'
                elif attribute == 'energy'.lower():
                    csvRowString += 'energy'  
                elif attribute == 'key'.lower():
                    csvRowString += 'key'  
                elif attribute == 'key_confidence'.lower():
                    csvRowString += 'key_confidence' 
                elif attribute == 'loudness'.lower():
                    csvRowString += 'loudness' 
                elif attribute == 'mode'.lower():
                    csvRowString += 'mode'  
                elif attribute == 'mode_confidence'.lower():
                    csvRowString += 'mode_confidence'   
                elif attribute == 'start_of_fade_out'.lower():
                    csvRowString += 'start_of_fade_out'                                                                                
                elif attribute == 'Exit'.lower():
                    sys.exit()
                else:
                    prompt = True
                    print "=============="
                    print "I believe there has been an error with the input."
                    print "=============="
                    break

                csvRowString += ","

            lastIndex = len(csvRowString)
            csvRowString = csvRowString[0:lastIndex-1]
            csvRowString += "\n"
            outputFile1.write(csvRowString);
            csvRowString = ""
    #else, if you want to hard code the order of the csv file and not prompt
    #the user, 
    else:
        #################################################
        #change the order of the csv file here
        #Default is to list all available attributes (in alphabetical order)
        csvRowString = ("SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,"+
            "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,"+
            "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"+
            "Title,Year,track_id,artist_hotttnesss,artist_mbid,artist_playmeid,artist_7digitalid,"+
            "release,release_7digitalid,song_hotttnesss,track_7digitalid,analysis_sample_rate,audio_md5,"+
            "end_of_fade_in,energy,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out")
        #################################################

        csvAttributeList = re.split('\W+', csvRowString)
        for i, v in enumerate(csvAttributeList):
            csvAttributeList[i] = csvAttributeList[i].lower()
        outputFile1.write("SongNumber,");
        outputFile1.write(csvRowString + "\n");
        csvRowString = ""  

    #################################################


    #Set the basedir here, the root directory from which the search
    #for files stored in a (hierarchical data structure) will originate
    basedir = "/vagrant/genrepython/MillionSongSubset" # "." As the default means the current directory
    ext = ".h5" #Set the extension here. H5 is the extension for HDF5 files.
    #################################################

    #FOR LOOP
    for root, dirs, files in os.walk(basedir):        
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            print f

            songH5File = hdf5_getters.open_h5_file_read(f)
            song = Song(str(hdf5_getters.get_song_id(songH5File)))

            testDanceability = hdf5_getters.get_danceability(songH5File)
            # print type(testDanceability)
            # print ("Here is the danceability: ") + str(testDanceability)

            song.artistID = str(hdf5_getters.get_artist_id(songH5File))
            song.albumID = str(hdf5_getters.get_release_7digitalid(songH5File))
            song.albumName = str(hdf5_getters.get_release(songH5File))
            song.artistLatitude = str(hdf5_getters.get_artist_latitude(songH5File))
            song.artistLocation = str(hdf5_getters.get_artist_location(songH5File))
            song.artistLongitude = str(hdf5_getters.get_artist_longitude(songH5File))
            song.artistName = str(hdf5_getters.get_artist_name(songH5File))
            song.danceability = str(hdf5_getters.get_danceability(songH5File))
            song.duration = str(hdf5_getters.get_duration(songH5File))
            # song.setGenreList()
            song.keySignature = str(hdf5_getters.get_key(songH5File))
            song.keySignatureConfidence = str(hdf5_getters.get_key_confidence(songH5File))
            # song.lyrics = None
            # song.popularity = None
            song.tempo = str(hdf5_getters.get_tempo(songH5File))
            song.timeSignature = str(hdf5_getters.get_time_signature(songH5File))
            song.timeSignatureConfidence = str(hdf5_getters.get_time_signature_confidence(songH5File))
            song.title = str(hdf5_getters.get_title(songH5File))
            song.year = str(hdf5_getters.get_year(songH5File))
            song.track_id = str(hdf5_getters.get_track_id(songH5File))
            song.artist_familiarity = str(hdf5_getters.get_artist_familiarity(songH5File))
            song.artist_hotttnesss = str(hdf5_getters.get_artist_hotttnesss(songH5File))
            song.artist_mbid = str(hdf5_getters.get_artist_mbid(songH5File))
            song.artist_playmeid = str(hdf5_getters.get_artist_playmeid(songH5File))
            song.artist_7digitalid = str(hdf5_getters.get_artist_7digitalid(songH5File))
            song.release = str(hdf5_getters.get_release(songH5File))
            song.release_7digitalid = str(hdf5_getters.get_release_7digitalid(songH5File))
            song.song_hotttnesss = str(hdf5_getters.get_song_hotttnesss(songH5File))
            song.track_7digitalid = str(hdf5_getters.get_track_7digitalid(songH5File))
            song.analysis_sample_rate = str(hdf5_getters.get_analysis_sample_rate(songH5File))
            song.audio_md5 = str(hdf5_getters.get_audio_md5(songH5File))
            song.end_of_fade_in = str(hdf5_getters.get_end_of_fade_in(songH5File))
            song.energy = str(hdf5_getters.get_energy(songH5File))
            song.key = str(hdf5_getters.get_key(songH5File))
            song.key_confidence = str(hdf5_getters.get_key_confidence(songH5File))
            song.loudness = str(hdf5_getters.get_loudness(songH5File))
            song.mode = str(hdf5_getters.get_mode(songH5File))
            song.mode_confidence = str(hdf5_getters.get_mode_confidence(songH5File))
            song.start_of_fade_out = str(hdf5_getters.get_start_of_fade_out(songH5File))

            #print song count
            csvRowString += str(song.songCount) + ","

            for attribute in csvAttributeList:
                # print "Here is the attribute: " + attribute + " \n"

                if attribute == 'AlbumID'.lower():
                    csvRowString += song.albumID
                elif attribute == 'AlbumName'.lower():
                    albumName = song.albumName
                    albumName = albumName.replace(',',"")
                    csvRowString += "\"" + albumName + "\""
                elif attribute == 'ArtistID'.lower():
                    csvRowString += "\"" + song.artistID + "\""
                elif attribute == 'ArtistLatitude'.lower():
                    latitude = song.artistLatitude
                    if latitude == 'nan':
                        latitude = ''
                    csvRowString += latitude
                elif attribute == 'ArtistLocation'.lower():
                    location = song.artistLocation
                    location = location.replace(',','')
                    csvRowString += "\"" + location + "\""
                elif attribute == 'ArtistLongitude'.lower():
                    longitude = song.artistLongitude
                    if longitude == 'nan':
                        longitude = ''
                    csvRowString += longitude                
                elif attribute == 'ArtistName'.lower():
                    csvRowString += "\"" + song.artistName + "\""                
                elif attribute == 'Danceability'.lower():
                    csvRowString += song.danceability
                elif attribute == 'Duration'.lower():
                    csvRowString += song.duration
                elif attribute == 'KeySignature'.lower():
                    csvRowString += song.keySignature
                elif attribute == 'KeySignatureConfidence'.lower():
                    # print "key sig conf: " + song.timeSignatureConfidence                                 
                    csvRowString += song.keySignatureConfidence
                elif attribute == 'SongID'.lower():
                    csvRowString += "\"" + song.id + "\""
                elif attribute == 'Tempo'.lower():
                    # print "Tempo: " + song.tempo
                    csvRowString += song.tempo
                elif attribute == 'TimeSignature'.lower():
                    csvRowString += song.timeSignature
                elif attribute == 'TimeSignatureConfidence'.lower():
                    # print "time sig conf: " + song.timeSignatureConfidence                                   
                    csvRowString += song.timeSignatureConfidence
                elif attribute == 'Title'.lower():
                    csvRowString += "\"" + song.title + "\""
                elif attribute == 'Year'.lower():
                    csvRowString += song.year
                elif attribute == 'track_id'.lower():
                    csvRowString += song.track_id 
                elif attribute == 'artist_familiarity'.lower():
                    csvRowString += song.artist_familiarity  
                elif attribute == 'artist_hotttnesss'.lower():
                    csvRowString += song.artist_hotttnesss 
                elif attribute == 'artist_mbid'.lower():
                    csvRowString += song.artist_mbid
                elif attribute == 'artist_playmeid'.lower():
                    csvRowString += song.artist_playmeid 
                elif attribute == 'artist_7digitalid'.lower():
                    csvRowString += song.artist_7digitalid 
                elif attribute == 'release'.lower():
                    csvRowString += song.release
                elif attribute == 'release_7digitalid'.lower():
                    csvRowString += song.release_7digitalid 
                elif attribute == 'song_hotttnesss'.lower():
                    csvRowString += song.song_hotttnesss  
                elif attribute == 'track_7digitalid'.lower():
                    csvRowString += song.track_7digitalid 
                elif attribute == 'analysis_sample_rate'.lower():
                    csvRowString += song.analysis_sample_rate  
                elif attribute == 'audio_md5'.lower():
                    csvRowString += song.audio_md5 
                elif attribute == 'end_of_fade_in'.lower():
                    csvRowString += song.end_of_fade_in  
                elif attribute == 'energy'.lower():
                    csvRowString += song.energy 
                elif attribute == 'key'.lower():
                    csvRowString += song.key 
                elif attribute == 'key_confidence'.lower():
                    csvRowString += song.key_confidence 
                elif attribute == 'loudness'.lower():
                    csvRowString += song.loudness
                elif attribute == 'mode'.lower():
                    csvRowString += song.mode   
                elif attribute == 'mode_confidence'.lower():
                    csvRowString += song.mode_confidence    
                elif attribute == 'start_of_fade_out'.lower():
                    csvRowString += song.start_of_fade_out                                                                              
                else:
                    csvRowString += "Erm. This didn't work. Error. :( :(\n"

                csvRowString += ","

            #Remove the final comma from each row in the csv
            lastIndex = len(csvRowString)
            csvRowString = csvRowString[0:lastIndex-1]
            csvRowString += "\n"
            outputFile1.write(csvRowString)
            csvRowString = ""

            songH5File.close()

    outputFile1.close()
Esempio n. 47
0
def main():
    """Walk the tree below ``basedir`` for .H5 song files and write one CSV
    row per song to songs.csv, numbering the songs in discovery order."""
    out = open('songs.csv', 'w')
    csv_writer = csv.writer(out)

    # header line; every column after song_number is exactly the suffix of
    # the matching hdf5_getters.get_<name> accessor
    header = "song_number,artist_familiarity,artist_hotttnesss,artist_id,artist_mbid,artist_playmeid,artist_7digitalid,artist_latitude,artist_longitude,artist_location,artist_name,release,release_7digitalid,song_id,song_hotttnesss,title,track_7digitalid,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id,year"

    out.write(header + "\n")

    # derive the getter list straight from the header so the two can
    # never fall out of sync
    field_names = header.split(',')[1:]

    #################################################
    #Set the basedir here, the root directory from which the search
    #for files stored in a (hierarchical data structure) will originate
    base_dir = "."  # "." As the default means the current directory
    h5_ext = ".H5"  #Set the extension here. H5 is the extension for HDF5 files.
    #################################################

    song_index = 0
    for top, _dirs, _files in os.walk(base_dir):
        for track_path in glob.glob(os.path.join(top, '*' + h5_ext)):
            print(track_path)

            handle = hdf5_getters.open_h5_file_read(track_path)
            row = [song_index]
            for name in field_names:
                row.append(getattr(hdf5_getters, 'get_' + name)(handle))
            handle.close()
            song_index = song_index + 1

            csv_writer.writerow(row)

    out.close()
Esempio n. 48
0
                #segments_timbre = ','.join(str(e) for e in GETTERS.get_segments_timbre(h5, i)) # array
                similar_artists = ','.join(
                    str(e)
                    for e in GETTERS.get_similar_artists(h5, i))  # array
                song_hotttnesss = GETTERS.get_song_hotttnesss(h5, i)
                song_id = GETTERS.get_song_id(h5, i)
                start_of_fade_out = GETTERS.get_start_of_fade_out(h5, i)
                #tatums_confidence = ','.join(str(e) for e in GETTERS.get_tatums_confidence(h5, i)) # array
                #tatums_start = ','.join(str(e) for e in GETTERS.get_tatums_start(h5, i)) # array
                tempo = GETTERS.get_tempo(h5, i)
                time_signature = GETTERS.get_time_signature(h5, i)
                time_signature_confidence = GETTERS.get_time_signature_confidence(
                    h5, i)
                title = GETTERS.get_title(h5, i)
                track_7digitalid = GETTERS.get_track_7digitalid(h5, i)
                track_id = GETTERS.get_track_id(h5, i)
                year = GETTERS.get_year(h5, i)
                loops += 1

                #row = {'analysis_sample_rate':analysis_sample_rate,'artist_7digitalid':artist_7digitalid,'artist_familiarity':artist_familiarity,'artist_hotttnesss':artist_hotttnesss,'artist_id':artist_id,'artist_latitude':artist_latitude,'artist_location':artist_location,'artist_longitude':artist_longitude,'artist_mbid':artist_mbid,'artist_mbtags_count':artist_mbtags_count,'artist_mbtags':artist_mbtags,'artist_name':artist_name,'artist_terms_freq':artist_terms_freq,'artist_terms_weight':artist_terms_weight,'artist_terms':artist_terms,'audio_md5':audio_md5,'bars_confidence':bars_confidence,'bars_start':bars_start,'beats_confidence':beats_confidence,'beats_start':beats_start,'danceability':danceability,'duration':duration,'end_of_fade_in':end_of_fade_in,'energy':energy,'key_confidence':key_confidence,'key':key,'loudness':loudness,'mode_confidence':mode_confidence,'mode':mode,'release_7digitalid':release_7digitalid,'release':release,'sections_confidence':sections_confidence,'sections_start':sections_start,'segments_confidence':segments_confidence,'segments_loudness_max_time':segments_loudness_max_time,'segments_loudness_max':segments_loudness_max,'segments_loudness_start':segments_loudness_start,'segments_pitches':segments_pitches,'segments_start':segments_start,'segments_timbre':segments_timbre,'similar_artists':similar_artists,'song_hotttnesss':song_hotttnesss,'song_id':song_id,'start_of_fade_out':start_of_fade_out,'tatums_confidence':tatums_confidence,'tatums_start':tatums_start,'tempo':tempo,'time_signature_confidence':time_signature_confidence,'time_signature':time_signature,'title':title,'track_7digitalid':track_7digitalid,'track_id':track_id,'year':year,}
                row = {
                    'analysis_sample_rate': analysis_sample_rate,
                    'artist_7digitalid': artist_7digitalid,
                    'artist_familiarity': artist_familiarity,
                    'artist_hotttnesss': artist_hotttnesss,
                    'artist_id': artist_id,
                    'artist_latitude': artist_latitude,
                    'artist_location': artist_location,
                    'artist_longitude': artist_longitude,
                    'artist_mbid': artist_mbid,
                    'artist_mbtags_count': artist_mbtags_count,
                    int(
                        math.floor(len(timbre_pitches_old)) /
                        smoothing_factor)):
                segments = segments_timbre_old[(smoothing_factor *
                                                i):(smoothing_factor *
                                                    (i + 1))]
                # calculate mean frequency of each note over a block of 5 time segments
                segments_mean = map(mean, zip(*segments))
                segments_timbre_smoothed.append(segments_mean)
            print('song count: {0}'.format(count + 1))
            h5_subdict = dict()
            h5_subdict['title'] = hdf5_getters.get_title(h5)
            h5_subdict['artist_name'] = hdf5_getters.get_artist_name(h5)
            h5_subdict['year'] = hdf5_getters.get_year(h5)
            h5_subdict['chord_changes'] = chord_changes
            h5_subdict['duration'] = hdf5_getters.get_duration(h5)
            h5_subdict['timbre'] = segments_timbre_smoothed
            track_id = hdf5_getters.get_track_id(h5)
            all_song_data[track_id] = h5_subdict
            print(
                'Song {0} finished processing. Total time elapsed: {1} seconds'
                .format(count, str(time.time() - start_time)))
        h5.close()

# Order the collected songs by their 'year' field.  NOTE(review): turning the
# sorted pairs back into a dict only preserves the ordering on interpreters
# where dicts keep insertion order (CPython 3.7+); on older versions the sort
# is effectively discarded -- confirm the target interpreter.
all_song_data_sorted = dict(
    sorted(all_song_data.items(), key=lambda k: k[1]['year']))
# Output filename embeds the first CLI argument with any '/' stripped out.
sortedpitchdata = 'output/msd_EDM_sortedpitchdata_' + re.sub(
    '/', '', sys.argv[1]) + '.txt'
with open(sortedpitchdata, 'w') as text_file:
    # written as a Python literal (str() of the whole dict), not JSON
    text_file.write(str(all_song_data_sorted))
Esempio n. 50
0
def main():
    """Walk a directory tree of Million Song Dataset .h5 files and flatten
    selected per-song attributes into a single CSV file (SongCSV.csv).

    Each output row is prefixed with a running song number; the column
    order is either typed in interactively (prompt=True) or taken from the
    hard-coded header string below (prompt=False, the default).
    """
    # NOTE(review): the file handle is opened without a context manager; it
    # is closed at the end of the function but would leak on an exception.
    outputFile1 = open('SongCSV.csv', 'w')
    csvRowString = ""

    #################################################
    #if you want to prompt the user for the order of attributes in the csv,
    #leave the prompt boolean set to True
    #else, set 'prompt' to False and set the order of attributes in the 'else'
    #clause
    prompt = False
    #################################################
    if prompt == True:
        # The while loop re-prompts until the user enters only recognized
        # attribute names (any bad token sets prompt back to True below).
        while prompt:

            prompt = False

            csvAttributeString = raw_input(
                "\n\nIn what order would you like the colums of the CSV file?\n"
                + "Please delineate with commas. The options are: " +
                "AlbumName, AlbumID, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude,"
                +
                " ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo,"
                +
                " SongID, TimeSignature, TimeSignatureConfidence, Title, and Year.\n\n"
                +
                "For example, you may write \"Title, Tempo, Duration\"...\n\n"
                + "...or exit by typing 'exit'.\n\n")

            # Split on any run of non-word characters, then normalize all
            # attribute names to lowercase for the comparisons below.
            csvAttributeList = re.split('\W+', csvAttributeString)
            for i, v in enumerate(csvAttributeList):
                csvAttributeList[i] = csvAttributeList[i].lower()

            # Echo each recognized attribute back into the CSV header row.
            # NOTE(review): unlike the hard-coded header in the else-branch,
            # this branch offers no 'TrackId' option.
            for attribute in csvAttributeList:
                # print "Here is the attribute: " + attribute + " \n"

                if attribute == 'AlbumID'.lower():
                    csvRowString += 'AlbumID'
                elif attribute == 'AlbumName'.lower():
                    csvRowString += 'AlbumName'
                elif attribute == 'ArtistID'.lower():
                    csvRowString += 'ArtistID'
                elif attribute == 'ArtistLatitude'.lower():
                    csvRowString += 'ArtistLatitude'
                elif attribute == 'ArtistLocation'.lower():
                    csvRowString += 'ArtistLocation'
                elif attribute == 'ArtistLongitude'.lower():
                    csvRowString += 'ArtistLongitude'
                elif attribute == 'ArtistName'.lower():
                    csvRowString += 'ArtistName'
                elif attribute == 'Danceability'.lower():
                    csvRowString += 'Danceability'
                elif attribute == 'Duration'.lower():
                    csvRowString += 'Duration'
                elif attribute == 'KeySignature'.lower():
                    csvRowString += 'KeySignature'
                elif attribute == 'KeySignatureConfidence'.lower():
                    csvRowString += 'KeySignatureConfidence'
                elif attribute == 'SongID'.lower():
                    csvRowString += "SongID"
                elif attribute == 'Tempo'.lower():
                    csvRowString += 'Tempo'
                elif attribute == 'TimeSignature'.lower():
                    csvRowString += 'TimeSignature'
                elif attribute == 'TimeSignatureConfidence'.lower():
                    csvRowString += 'TimeSignatureConfidence'
                elif attribute == 'Title'.lower():
                    csvRowString += 'Title'
                elif attribute == 'Year'.lower():
                    csvRowString += 'Year'
                elif attribute == 'Exit'.lower():
                    sys.exit()
                else:
                    # Unrecognized token: discard this attempt and re-prompt.
                    prompt = True
                    print "=============="
                    print "I believe there has been an error with the input."
                    print "=============="
                    break

                csvRowString += ","

            # Drop the trailing comma, terminate the header line, and write it.
            lastIndex = len(csvRowString)
            csvRowString = csvRowString[0:lastIndex - 1]
            csvRowString += "\n"
            outputFile1.write(csvRowString)
            csvRowString = ""
    #else, if you want to hard code the order of the csv file and not prompt
    #the user,
    else:
        #################################################
        #change the order of the csv file here
        #Default is to list all available attributes (in alphabetical order)
        csvRowString = (
            "SongID,AlbumID,AlbumName,TrackId,ArtistID,ArtistLatitude,ArtistLocation,"
            +
            "ArtistLongitude,ArtistName,Danceability,Duration,KeySignature," +
            "KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,"
            + "Title,Year")
        #################################################

        # Reuse the header string as the (lowercased) column order for rows.
        csvAttributeList = re.split('\W+', csvRowString)
        for i, v in enumerate(csvAttributeList):
            csvAttributeList[i] = csvAttributeList[i].lower()
        outputFile1.write("SongNumber,")
        outputFile1.write(csvRowString + "\n")
        csvRowString = ""

    #################################################

    #Set the basedir here, the root directory from which the search
    #for files stored in a (hierarchical data structure) will originate
    basedir = "/home/umwangye/millonsong/MillionSongSubset/data/"  # "." As the default means the current directory
    ext = ".h5"  #Set the extension here. H5 is the extension for HDF5 files.
    #################################################

    #FOR LOOP
    for root, dirs, files in os.walk(basedir):
        # Restrict to .h5 files in the current directory of the walk.
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            print f

            songH5File = hdf5_getters.open_h5_file_read(f)
            #song = Song(str(hdf5_getters.get_song_id(songH5File)))

            #testDanceability = hdf5_getters.get_danceability(songH5File)
            # print type(testDanceability)
            # print ("Here is the danceability: ") + str(testDanceability)
            # An aggregate .h5 file can contain several songs; iterate them all.
            numPerH5 = hdf5_getters.get_num_songs(songH5File)

            for cnt in range(numPerH5):
                # Every field is stringified immediately so the CSV assembly
                # below can concatenate without further conversion.
                song = Song(str(hdf5_getters.get_song_id(songH5File, cnt)))
                song.trackId = str(hdf5_getters.get_track_id(songH5File, cnt))
                song.artistID = str(hdf5_getters.get_artist_id(
                    songH5File, cnt))
                # NOTE(review): albumID is populated from the 7digital release
                # id — confirm this is the intended album identifier.
                song.albumID = str(
                    hdf5_getters.get_release_7digitalid(songH5File, cnt))
                song.albumName = str(hdf5_getters.get_release(songH5File, cnt))
                song.artistLatitude = str(
                    hdf5_getters.get_artist_latitude(songH5File, cnt))
                song.artistLocation = str(
                    hdf5_getters.get_artist_location(songH5File, cnt))
                song.artistLongitude = str(
                    hdf5_getters.get_artist_longitude(songH5File, cnt))
                song.artistName = str(
                    hdf5_getters.get_artist_name(songH5File, cnt))
                song.danceability = str(
                    hdf5_getters.get_danceability(songH5File, cnt))
                song.duration = str(hdf5_getters.get_duration(songH5File, cnt))
                # song.setGenreList()
                song.keySignature = str(hdf5_getters.get_key(songH5File, cnt))
                song.keySignatureConfidence = str(
                    hdf5_getters.get_key_confidence(songH5File, cnt))
                # song.lyrics = None
                # song.popularity = None
                song.tempo = str(hdf5_getters.get_tempo(songH5File, cnt))
                song.timeSignature = str(
                    hdf5_getters.get_time_signature(songH5File, cnt))
                song.timeSignatureConfidence = str(
                    hdf5_getters.get_time_signature_confidence(
                        songH5File, cnt))
                song.title = str(hdf5_getters.get_title(songH5File, cnt))
                song.year = str(hdf5_getters.get_year(songH5File, cnt))

                #print song count
                csvRowString += str(song.songCount) + ","

                # Emit the fields in the column order chosen above; free-text
                # fields are quoted and have embedded commas stripped so the
                # CSV stays well-formed.
                for attribute in csvAttributeList:
                    # print "Here is the attribute: " + attribute + " \n"

                    if attribute == 'AlbumID'.lower():
                        csvRowString += song.albumID
                    elif attribute == 'AlbumName'.lower():
                        albumName = song.albumName
                        albumName = albumName.replace(',', "")
                        csvRowString += "\"" + albumName + "\""
                    elif attribute == 'TrackId'.lower():
                        csvRowString += song.trackId
                    elif attribute == 'ArtistID'.lower():
                        csvRowString += "\"" + song.artistID + "\""
                    elif attribute == 'ArtistLatitude'.lower():
                        # Blank out missing coordinates (stringified as 'nan')
                        # so the CSV cell is empty rather than 'nan'.
                        latitude = song.artistLatitude
                        if latitude == 'nan':
                            latitude = ''
                        csvRowString += latitude
                    elif attribute == 'ArtistLocation'.lower():
                        location = song.artistLocation
                        location = location.replace(',', '')
                        csvRowString += "\"" + location + "\""
                    elif attribute == 'ArtistLongitude'.lower():
                        longitude = song.artistLongitude
                        if longitude == 'nan':
                            longitude = ''
                        csvRowString += longitude
                    elif attribute == 'ArtistName'.lower():
                        csvRowString += "\"" + song.artistName + "\""
                    elif attribute == 'Danceability'.lower():
                        csvRowString += song.danceability
                    elif attribute == 'Duration'.lower():
                        csvRowString += song.duration
                    elif attribute == 'KeySignature'.lower():
                        csvRowString += song.keySignature
                    elif attribute == 'KeySignatureConfidence'.lower():
                        # print "key sig conf: " + song.timeSignatureConfidence
                        csvRowString += song.keySignatureConfidence
                    elif attribute == 'SongID'.lower():
                        csvRowString += "\"" + song.id + "\""
                    elif attribute == 'Tempo'.lower():
                        # print "Tempo: " + song.tempo
                        csvRowString += song.tempo
                    elif attribute == 'TimeSignature'.lower():
                        csvRowString += song.timeSignature
                    elif attribute == 'TimeSignatureConfidence'.lower():
                        # print "time sig conf: " + song.timeSignatureConfidence
                        csvRowString += song.timeSignatureConfidence
                    elif attribute == 'Title'.lower():
                        csvRowString += "\"" + song.title + "\""
                    elif attribute == 'Year'.lower():
                        csvRowString += song.year

                    else:
                        csvRowString += "Erm. This didn't work. Error. :( :(\n"

                    csvRowString += ","

            #Remove the final comma from each row in the csv
                lastIndex = len(csvRowString)
                csvRowString = csvRowString[0:lastIndex - 1]
                csvRowString += "\n"
                outputFile1.write(csvRowString)
                csvRowString = ""

            songH5File.close()

    outputFile1.close()
                # to one of 196 categories for a chord shift
                chord_shift = 12*(key_shift - 1) + note_shift
                chord_changes[chord_shift] += 1
                # print ('chord shift: {0}'.format(chord_shift))
            pitch_segs_data.append(chord_changes)
            '''Now smooth out the timbre segments based on BPM'''
            segments_timbre_old = hdf5_getters.get_segments_timbre(h5)
            segments_timbre_smoothed = []
            for i in range(0,int(math.floor(len(timbre_pitches_old))/smoothing_factor)):
                segments = segments_timbre_old[(smoothing_factor*i):(smoothing_factor*(i+1))]
                # calculate mean frequency of each note over a block of 5 time segments
                segments_mean = map(mean, zip(*segments))
                segments_timbre_smoothed.append(segments_mean)
            print ('song count: {0}'.format(count+1))
            h5_subdict = dict()
            h5_subdict['title'] = hdf5_getters.get_title(h5)
            h5_subdict['artist_name'] = hdf5_getters.get_artist_name(h5)
            h5_subdict['year'] = hdf5_getters.get_year(h5)
            h5_subdict['chord_changes'] = chord_changes
            h5_subdict['duration'] = hdf5_getters.get_duration(h5)
            h5_subdict['timbre'] = segments_timbre_smoothed
            track_id = hdf5_getters.get_track_id(h5)
            all_song_data[track_id] = h5_subdict
            print('Song {0} finished processing. Total time elapsed: {1} seconds'.format(count,str(time.time() - start_time)))
        h5.close() 

# Sort all collected per-track dicts by release year (ascending).
all_song_data_sorted = dict(sorted(all_song_data.items(), key=lambda k: k[1]['year'])) 
# Output file is named after the input directory argument, slashes stripped.
sortedpitchdata = 'output/msd_EDM_sortedpitchdata_' + re.sub('/','',sys.argv[1]) + '.txt'
# Dump the whole sorted dict as a Python-literal string.
with open(sortedpitchdata, 'w') as text_file:
    text_file.write(str(all_song_data_sorted))