def getclusters(basedir,ext='.h5') :
    """Collect beat-aligned timbre and chroma vectors for clustering.

    Walks ``basedir`` recursively, opens every file whose name ends with
    ``ext`` through ``hdf5_getters`` and, for each song with a non-zero
    year, appends the rows of the transposed ``bt.get_bttimbre`` /
    ``bt.get_btchromas`` matrices to two growing collections.

    Songs are grouped by the label returned by ``getbin(year)``.  Once a
    bin holds more than the module-level ``cap`` accepted songs, further
    songs of that bin are skipped, and the scan stops early as soon as
    ``checkforcompletion(decadecount)`` returns a true value.

    Parameters:
        basedir: root directory that is searched recursively.
        ext: file-name suffix of the song files (default ``'.h5'``).

    Returns:
        ``(features, cfeatures)``: the collected timbre vectors and chroma
        vectors, each converted with ``array`` (on the early-exit path as
        well as on the normal one).
    """
    print('inside clusters')
    features = []
    cfeatures = []
    decadecount = defaultdict(int)   # songs accepted per bin (subject to cap)
    deccount = defaultdict(int)      # songs seen per bin, accepted or not
    i = 0
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                year = hdf5_getters.get_year(h5)
                if year == 0:
                    # Song without year information: nothing to bin it by.
                    continue
                bins = getbin(year)
                # Keyed by the computed bin label (the builtin ``bin`` was
                # used here by mistake before).
                deccount[bins] += 1
                if decadecount[bins] > cap:
                    if checkforcompletion(decadecount):
                        for dec in decadecount.keys():
                            print('Dec : ' + str(dec) + ' Count : ' + str(decadecount[dec]))
                        return array(features), array(cfeatures)
                    continue
                i += 1
                print(i)
                try:
                    # Fetch both matrices before storing anything, so a song
                    # whose chroma extraction fails contributes no vectors.
                    timbre_rows = bt.get_bttimbre(h5).T
                    chroma_rows = bt.get_btchromas(h5).T
                except Exception:
                    # Best effort: skip songs whose features cannot be read.
                    continue
                features.extend(timbre_rows)
                cfeatures.extend(chroma_rows)
                # One increment per accepted song, so ``cap`` counts songs.
                decadecount[bins] += 1
            finally:
                # Always release the file handle, whatever path was taken.
                h5.close()
    for dec in deccount.keys():
        print('Dec : ' + str(dec) + ' Count : ' + str(decadecount[dec]))
    return array(features), array(cfeatures)
def beat_aligned_chroma(song_id, file_name) :
  """Build one row dict per beat from a song's beat-aligned chroma matrix.

  The matrix comes from ``beat_aligned_feats.get_btchromas(file_name)`` and
  is transposed before use.  Each row dict holds ``'song_id'``,
  ``'beat_number'`` and twelve values keyed by ``pitch_schema[0..11]``.

  Returns ``(rows, transposed_matrix)``, or ``([], None)`` when no chroma
  matrix is available.
  """
  chroma = beat_aligned_feats.get_btchromas(file_name)
  if chroma is None :
    return [], None

  per_beat = chroma.T
  rows = []
  for beat_idx in range(per_beat.shape[0]) :
    row = {'song_id': song_id, 'beat_number': beat_idx}
    # Attach the twelve pitch-class values under their schema names.
    row.update(
      (pitch_schema[pitch], per_beat[beat_idx][pitch]) for pitch in range(12)
    )
    rows.append(row)
  return rows, per_beat
    # sanity checks
    if not os.path.isfile(songfile):
        print 'ERROR: %s does not exist.' % songfile
        sys.exit(0)

    # tbm path, import stuff
    import beat_aligned_feats as BAF
    import pylab as P
    import warnings
    warnings.filterwarnings('ignore', category=DeprecationWarning)

    # get chroma
    btchroma = BAF.get_btchromas_loudness(songfile)
    btchroma_db = np.log10(BAF.get_btchromas_loudness(songfile)) * 20.
    btchroma_normal = BAF.get_btchromas(songfile)

    # get landmarks
    landmarks = get_landmarks(btchroma, decay=decay)
    landmarks_normal = get_landmarks(btchroma_normal, decay=decay)

    # plot
    pargs = {'aspect': 'auto',
             'cmap': P.cm.gray_r,
             'interpolation': 'nearest',
             'origin': 'lower'}
    P.subplot(4, 1, 1)
    P.imshow(btchroma, **pargs)
    P.subplot(4, 1, 2)
    P.imshow(landmarks_normal, **pargs)
def buildfeatures(basedir,cluster,ccluster,ext='.h5'):
    """Build a cluster-histogram feature vector for every usable song.

    Walks ``basedir`` recursively and, for each file ending in ``ext`` with
    a non-zero year, assigns every beat-aligned timbre vector and every
    beat-aligned chroma vector to one of 50 clusters.  The song's feature
    vector is the 50 timbre-cluster counts followed by the 50
    chroma-cluster counts, always in cluster-index order 0..49.

    Songs are grouped by ``getbin(year)``; a bin that already holds more
    than the module-level ``cap`` songs accepts no more, and the scan stops
    early once ``checkforcompletion(decadecount)`` returns a true value.

    Parameters:
        basedir: root directory that is searched recursively.
        cluster: model used to label timbre vectors.  It is used when it
            exposes ``predict``; otherwise the module-level ``kmeans`` is
            used, as the function did before.
        ccluster: model used to label chroma vectors, with the same
            fallback to the module-level ``ckmeans``.
        ext: file-name suffix of the song files (default ``'.h5'``).

    Returns:
        ``(features, decade)``: a list of feature vectors and the parallel
        list of bin labels.
    """
    global cap
    n_clusters = 50
    i = 0
    features = []
    decade = []
    decadecount = defaultdict(int)
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                year = hdf5_getters.get_year(h5)
                if year == 0:
                    # Song without year information: no label to learn from.
                    continue
                bins = getbin(year)
                if decadecount[bins] > cap:
                    if checkforcompletion(decadecount):
                        return features, decade
                    continue
                i += 1
                print(i)

                try:
                    # Prefer the models handed in by the caller; fall back to
                    # the module-level ones only when an argument is not a
                    # usable model.
                    timbre_model = cluster if hasattr(cluster, 'predict') else kmeans
                    chroma_model = ccluster if hasattr(ccluster, 'predict') else ckmeans

                    # Lists indexed by cluster label keep the feature order
                    # fixed instead of relying on dict iteration order.
                    timbre_hist = [0] * n_clusters
                    for x in bt.get_bttimbre(h5).T:
                        timbre_hist[timbre_model.predict(x)[0]] += 1

                    chroma_hist = [0] * n_clusters
                    for y in bt.get_btchromas(h5).T:
                        chroma_hist[chroma_model.predict(y)[0]] += 1
                except Exception:
                    # Best effort: skip songs whose features cannot be built.
                    continue

                features.append(timbre_hist + chroma_hist)
                decade.append(bins)
                decadecount[bins] += 1
            finally:
                # Always release the file handle, whatever path was taken.
                h5.close()

    print(len(features))
    print(len(decade))
    return features,decade
    # sanity checks
    if not os.path.isfile(songfile):
        print 'ERROR: %s does not exist.' % songfile
        sys.exit(0)

    # tbm path, import stuff
    import beat_aligned_feats as BAF
    import pylab as P
    import warnings
    warnings.filterwarnings('ignore', category=DeprecationWarning)

    # get chroma
    btchroma = BAF.get_btchromas_loudness(songfile)
    btchroma_db = np.log10(BAF.get_btchromas_loudness(songfile)) * 20.
    btchroma_normal = BAF.get_btchromas(songfile)

    # get landmarks
    landmarks = get_landmarks(btchroma, decay=decay)
    landmarks_normal = get_landmarks(btchroma_normal, decay=decay)

    # plot
    pargs = {
        'aspect': 'auto',
        'cmap': P.cm.gray_r,
        'interpolation': 'nearest',
        'origin': 'lower'
    }
    P.subplot(4, 1, 1)
    P.imshow(btchroma, **pargs)
    P.subplot(4, 1, 2)
def get_all_titles(basedir,ext='.h5') :
    """Build a summary feature vector and a bin label for every usable song.

    Walks ``basedir`` recursively.  For each file ending in ``ext`` that
    can be opened and has a non-zero year, the feature vector is, in order:
    the mean of each row of the beat-aligned chroma matrix, the output of
    ``get_covariance`` on that matrix, the same two parts for the
    beat-aligned timbre matrix, then the song's loudness and duration.

    Songs are grouped by ``getbin(year)``; a bin that already holds more
    than the module-level ``cap`` songs accepts no more, and the scan stops
    early once ``checkforcompletion(decadecount)`` returns a true value.

    Module-level counters updated as a side effect: ``errorcount`` (files
    that failed to open or whose features could not be computed),
    ``truecount`` (songs seen per bin) and ``count`` (songs accepted).

    Parameters:
        basedir: root directory that is searched recursively.
        ext: file-name suffix of the song files (default ``'.h5'``).

    Returns:
        ``(features, decade)``: a list of feature vectors and the parallel
        list of bin labels.
    """
    global errorcount
    global count
    global cap
    global truecount

    features = []
    decade = []
    decadecount = defaultdict(int)
    i = 0
    for root, dirs, files in os.walk(basedir):
        for f in glob.glob(os.path.join(root, '*' + ext)):
            try:
                h5 = hdf5_getters.open_h5_file_read(f)
            except HDF5ExtError:
                errorcount += 1
                print('Unexpected error: %s' % (sys.exc_info()[0],))
                print(traceback.format_exc())
                continue
            try:
                year = hdf5_getters.get_year(h5)
                print(i)
                i += 1
                if year == 0:
                    # Song without year information: no label to learn from.
                    continue
                label = getbin(year)
                truecount[label] += 1
                if decadecount[label] > cap:
                    if checkforcompletion(decadecount):
                        for dec in decadecount.keys():
                            print('Decade : ' + str(dec) + ' Count : ' + str(decadecount[dec]))
                        return features, decade
                    continue

                feature = []
                try:
                    btchromas = bt.get_btchromas(h5)
                    feature.extend(mean(chroma) for chroma in btchromas)
                    feature.extend(get_covariance(btchromas))
                    bttimbre = bt.get_bttimbre(h5)
                    feature.extend(mean(timbre) for timbre in bttimbre)
                    feature.extend(get_covariance(bttimbre))
                except Exception:
                    # Best effort: count and skip songs whose features
                    # cannot be computed.
                    errorcount += 1
                    continue

                feature.append(hdf5_getters.get_loudness(h5))
                feature.append(hdf5_getters.get_duration(h5))
                features.append(feature)
                decade.append(label)
                decadecount[label] += 1
                count += 1
            finally:
                # Always release the file handle, including when a getter
                # raises or the scan returns early.
                h5.close()

    for dec in decadecount.keys():
        print('Decade : ' + str(dec) + ' Count : ' + str(decadecount[dec]))
    return features,decade
    # Load in list of files which were aligned correctly, and the start/end times of the good alignment
    files, start_times, end_times = load_results(tsv_path)

    for filename, start_time, end_time in zip(files, start_times, end_times):
        # Load in MSD hdf5 file
        h5 = hdf5_getters.open_h5_file_read(to_h5_path(filename))
        # Load in beat times from MSD
        beats = hdf5_getters.get_beats_start(h5)
        # Some files have no EN analysis
        if beats.size == 0:
            continue
        # Get indices which fall within the range of correct alignment
        time_mask = np.logical_and(beats > start_time, beats < end_time)
        beats = beats[time_mask]
        # and beat-synchronous feature matrices, within the time range of correct alignment
        chroma = beat_aligned_feats.get_btchromas(h5)[:, time_mask]
        timbre = beat_aligned_feats.get_bttimbre(h5)[:, time_mask]
        loudness = beat_aligned_feats.get_btloudnessmax(h5)[:, time_mask]
        h5.close()
        # Stack it
        msd_features = np.vstack([chroma, timbre, loudness])
        if np.isnan(msd_features).any():
            print filename
            continue
        # Load in pretty midi object
        pm = pretty_midi.PrettyMIDI(midi.read_midifile(to_midi_path(filename)))
        # Construct piano roll, aligned to the msd beat times
        piano_roll = pm.get_piano_roll(times=beats)
        # Ignore notes below 36 and above 84
        piano_roll = piano_roll[36:84, :]
        # Write out