It is convenient to work with
the pandas data frame for some plots
'''
# Build the modeler and keep its pandas data frame for the plots below.
modeler = Modeler()
df = modeler.df

# Get feature mask data and labels so we know what to plot.
# The last two entries of all_masks are treated separately below:
# [-2] is used as the frequency bins, [-1] is passed on its own to
# plot_feature_importance, and the remainder ([:-2]) are the masks.
all_masks = modeler.extract_frequencies_and_indeces()
frequency_bins = all_masks[-2]

# Start plotting.
make_cluster_purity_plots(components, labels)
conf_mat = plot_supervised_confusion_matrix(all_tests, all_predicts)
make_acc_prec_rec_plots(conf_mat)

# Raw strings keep the "\ " sequences byte-for-byte while avoiding Python's
# invalid-escape-sequence warning for "\ " in ordinary string literals.
mask_labels = [
    'Energy',
    'Beat Strength',
    '<Beat Separation>',
    r'$Med(Beat\ \ Separation)$',
    r'$Std(Beat\ \ Separation)$',
    'ZCR data',
    'Total energy',
]
features_of_interest = plot_feature_importance(
    all_importances, all_masks[:-2], mask_labels, all_masks[-1])

# We plot the top 5 features of interest in our scatter plots.
make_scatter_plots(features_of_interest[0:5], df)

# More detailed mask labels for the feature-vs-frequency plots.
mask_labels = [
    'Energy (scaled to total song energy)',
    'Normalized Beat Strength',
    'Normalized <Beat Separation>',
    r'$Med(Beat\ \ Separation)$',
    r'$Std(Beat\ \ Separation)$',
    'ZCR data',
    'Total energy',
]
make_feature_vs_frequency_plots(all_masks, mask_labels, X_unscaled,
                                frequency_bins, labels)

# Load one example recording and plot its signal against time.
reader = WaveRead()
sample_rate, recording = reader.get_recording('Tupac- Smile.wav')
plot_signal_vs_time_data(recording, sample_rate)
    for each song; labels = list of corresponding genre labels (strings) for each song
    '''    
    foo = zip(components, labels)
    struct_array = np.array([np.array([element[0], element[1]]) for element in foo])
    for name in set(struct_array[:,1]):
        mask = struct_array[:,1] == name
        print name
        print Counter(struct_array[mask][:,0])
    
# Feature generation using WaveRead.
# NOTE: this step takes several hours, so it is switched off by default.
file_convert = False
if file_convert:
    reader = WaveRead(60000, 0.05)
    reader.convert_all()
    
    
    
'''this creates the supervised and unsupervised models'''
# Switch for the modelling stage: the Modeler below is only constructed
# when conduct_model is True.
conduct_model = True
if conduct_model:
    # Modeler is not defined in this part of the file; the instance is bound
    # to the module-level name `modeler`.
    modeler = Modeler()
    '''run supervised/unsupervised models here'''
    '''
    OUTPUTS BELOW: components = 1-d numpy array of k-means cluster labels;
    song_names = 1-d numpy array of song names;
    song_labels = 1-d numpy array of genre labels;
    components2 = 1-d numpy array of k-means/NMF cluster labels;
    all_reports = list of confusion matrices (2-d numpy arrays), one from each KFold output;
the pandas data frame for some plots
'''
# Build the modeler and keep its pandas data frame for the plots below.
modeler = Modeler()
df = modeler.df

# Get feature mask data and labels so we know what to plot.
# The last two entries of all_masks are treated separately below:
# [-2] is used as the frequency bins, [-1] is passed on its own to
# plot_feature_importance, and the remainder ([:-2]) are the masks.
all_masks = modeler.extract_frequencies_and_indeces()
frequency_bins = all_masks[-2]

# Start plotting.
make_cluster_purity_plots(components, labels)
conf_mat = plot_supervised_confusion_matrix(all_tests, all_predicts)
make_acc_prec_rec_plots(conf_mat)

# Raw strings keep the "\ " sequences byte-for-byte while avoiding Python's
# invalid-escape-sequence warning for "\ " in ordinary string literals.
mask_labels = [
    'Energy',
    'Beat Strength',
    '<Beat Separation>',
    r'$Med(Beat\ \ Separation)$',
    r'$Std(Beat\ \ Separation)$',
    'ZCR data',
    'Total energy',
]
features_of_interest = plot_feature_importance(
    all_importances, all_masks[:-2], mask_labels, all_masks[-1])

# We plot the top 5 features of interest in our scatter plots.
make_scatter_plots(features_of_interest[0:5], df)

# More detailed mask labels for the feature-vs-frequency plots.
mask_labels = [
    'Energy (scaled to total song energy)',
    'Normalized Beat Strength',
    'Normalized <Beat Separation>',
    r'$Med(Beat\ \ Separation)$',
    r'$Std(Beat\ \ Separation)$',
    'ZCR data',
    'Total energy',
]
make_feature_vs_frequency_plots(all_masks, mask_labels, X_unscaled,
                                frequency_bins, labels)

# Load one example recording and plot its signal against time.
reader = WaveRead()
sample_rate, recording = reader.get_recording('Tupac- Smile.wav')
plot_signal_vs_time_data(recording, sample_rate)