def get_kmeans_clusters(data, n_clusters, songs_by_cluster={}):  # pylint: disable=dangerous-default-value
    '''
    Assigns songs in data to :n_clusters: unique clusters.

    FUTURE: Refactor feature engineering so that the data isn't
            copied wholesale, e.g., by implementing multiple kinds
            of features or better feature filtering. This will
            require changes to Classifier.

    :param data:        the data, obtained from read_data()
    :param n_clusters:  the number of clusters used in K-means

    :return             deep copy of data with a 'cluster' property and each
                        cluster added as a boolean feature
                        dict mapping n_clusters -> cluster -> ids of songs in that cluster
    '''

    data = dict(data)

    if n_clusters in songs_by_cluster:
        raise ValueError(str(n_clusters) + ' clusters already computed')

    songs_by_cluster[n_clusters] = {i: set() for i in range(n_clusters)}

    train_data, track_ids = dict(data), list(data)

    x_train = np.array(
        [list(x['features'].values()) for x in train_data.values()])
    y_train = [x['label'] for x in train_data.values()]

    km_classifier = KMeans(n_clusters,
                           init='random',
                           max_iter=300,
                           random_state=0,
                           n_init=30)
    km_classifier.fit(x_train)

    cluster_map = pd.DataFrame()

    cluster_map['data'] = x_train.tolist()
    cluster_map['cluster'] = km_classifier.labels_
    cluster_map['label'] = y_train
    cluster_map['track_id'] = track_ids

    for track_id in track_ids:

        data[track_id] = deepcopy(data[track_id])

        cluster = int(cluster_map.loc[cluster_map['track_id'] == track_id,
                                      'cluster'].iloc[0])

        songs_by_cluster[n_clusters][cluster].add(track_id)

        data[track_id]['cluster'] = cluster
        for i in range(n_clusters):
            data[track_id]['features']['c' + str(i)] = 1 if cluster == i else 0

    return FrozenMap(data), FrozenMap(songs_by_cluster)
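
A quick usage sketch (the CSV path, read_data() flag, and n_clusters=4 below are illustrative placeholders, not values taken from the project):

# Hypothetical usage -- path, flag, and cluster count are placeholders.
song_data = read_data('data/song_data.csv', True)

clustered_data, songs_by_cluster = get_kmeans_clusters(song_data, 4)

# Each song now carries its cluster id plus boolean cluster features c0..c3.
example = next(iter(clustered_data.values()))
print(example['cluster'],
      [example['features']['c' + str(i)] for i in range(4)])

# songs_by_cluster maps n_clusters -> cluster id -> set of track ids.
print({cluster: len(ids) for cluster, ids in songs_by_cluster[4].items()})
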
def split_data(data, proportion):
    '''
    Splits data into training and validation sets for simple classification.

    :param data:        complete labeled data
    :param proportion:  proportion of data to use for validation

    :return         train_data, validation_data
    '''
    validation_ids = rand_sample(list(data), int(len(data) * proportion))

    validation_data = {k: data[k] for k in validation_ids}
    train_data = {k: v for k, v in data.items() if k not in validation_data}

    return FrozenMap(train_data), FrozenMap(validation_data)
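
For completeness, a small holdout-split sketch (the 0.2 proportion is illustrative, and song_data is the same mapping used in the sketch above):

# Illustrative 80/20 split of the labeled data.
train_data, validation_data = split_data(song_data, 0.2)
print(len(train_data), len(validation_data))
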
Example #3
def test_setitem_TypeError(self):
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    with self.assertRaises(TypeError) as context:
        rgb['blue'] = 'topaz'
    self.assertEqual(
        str(context.exception),
        "'FrozenMap' object does not support item assignment")
    with self.assertRaises(TypeError) as context:
        rgb['grey'] = 'pewter'
    self.assertEqual(
        str(context.exception),
        "'FrozenMap' object does not support item assignment")
def get_playlist_features(SPOTIPY_OBJECT, playlist_data, playlist_name):
    '''
    Featurizes playlist song data, caches the scaled features to
    data/<playlist_name>.csv, and reads them back in.

    :param SPOTIPY_OBJECT:  Spotipy client object
    :param playlist_data:   playlist song data to featurize
    :param playlist_name:   name used for the cached CSV file

    :return             get_features_and_id() applied to the read-back data,
                        plus the read-back data itself as a FrozenMap
    '''

    # featurize playlist song data
    playlist_df = get_dataframe(SPOTIPY_OBJECT, playlist_data, -1)

    scaled_playlist_df = scale_data(playlist_df)
    scaled_playlist_df.to_csv('data/{}.csv'.format(playlist_name),
                              encoding='utf-8')

    playlist_data_dict = FrozenMap(
        read_data('data/{}.csv'.format(playlist_name), False))
    return get_features_and_id(playlist_data_dict), playlist_data_dict
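
A minimal call sketch; the client object, raw playlist payload, and playlist name below are placeholders for whatever the surrounding project actually passes in:

# 'sp' is assumed to be an authenticated Spotipy client and 'raw_tracks'
# playlist song data in the format get_dataframe() expects -- both are
# placeholders, not part of this module.
playlist_features, playlist_map = get_playlist_features(sp, raw_tracks,
                                                        'my_playlist')

# playlist_map is a FrozenMap of the featurized songs read back from
# data/my_playlist.csv; playlist_features is whatever get_features_and_id()
# returns for that data.
print(len(playlist_map))
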
def filter_features(data, discard):
    '''
    Filters out features from data. Does not modify passed-in object (creates a copy).

    :param data:        dict of data, same format as song_data
    :param discard:     feature names to discard

    :return             copy of data, with filtered features
    '''
    out = dict(data)

    for id_ in data:
        out[id_] = deepcopy(out[id_])
        out[id_]['features'] = {
            k: v
            for k, v in out[id_]['features'].items() if k not in discard
        }

    return FrozenMap(out)
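
A short usage sketch for the filter; the discarded feature names are illustrative, and song_data is the same mapping used in the earlier sketches:

# Drop a couple of (hypothetical) feature columns before training.
slimmed = filter_features(song_data, discard={'key', 'mode'})

sample = next(iter(slimmed.values()))
assert 'key' not in sample['features'] and 'mode' not in sample['features']
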
Example #6
def test_iter(self):
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    self.assertEqual(frozenset(iter(rgb)), frozenset(['red', 'green', 'blue']))
Example #7
def test_create(self):
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    self.assertIsInstance(rgb, FrozenMap)
    self.assertIn('red', rgb)
    self.assertIn('green', rgb)
    self.assertIn('blue', rgb)
Example #8
def test_hash(self):
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    self.assertIsInstance(hash(rgb), int)
Example #9
def test_len(self):
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    self.assertEqual(len(rgb), 3)
Example #10
def test_getitem_KeyError(self):
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    with self.assertRaises(KeyError) as context:
        rgb['grey']
    self.assertEqual(str(context.exception), "'grey'")
Example #11
def test_getitem(self):
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    self.assertEqual(rgb['red'], 'rouge')
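
Taken together, the tests above pin down the behaviour expected of FrozenMap: keyword construction, read-only item access, iteration, len(), hashability, and a TypeError on item assignment. A minimal sketch consistent with those tests (the project's actual implementation may well differ) could look like this:

from collections.abc import Mapping


class FrozenMap(Mapping):
    '''Minimal immutable mapping sketch consistent with the tests above.'''

    def __init__(self, *args, **kwargs):
        # Accepts the same arguments as dict(), e.g. FrozenMap(red='rouge').
        self._data = dict(*args, **kwargs)

    def __getitem__(self, key):
        # Missing keys raise KeyError, so str(exc) is "'grey'" as tested.
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __hash__(self):
        # Mapping sets __hash__ to None, so define it explicitly; hashing the
        # key set keeps the sketch usable even when values are unhashable
        # (e.g. the nested song dicts wrapped elsewhere in this module).
        return hash(frozenset(self._data))

Because the class defines no __setitem__, an assignment such as rgb['blue'] = 'topaz' raises Python's stock TypeError ("'FrozenMap' object does not support item assignment"), which is exactly what test_setitem_TypeError checks.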