def get_kmeans_clusters(data, n_clusters, songs_by_cluster={}):  # pylint: disable=dangerous-default-value
    '''
    Assigns songs in data to :n_clusters: unique clusters.

    FUTURE: Refactor feature engineering such that the data isn't copied
    wholesale. E.g., implement multiple kinds of features or better feature
    filtering. This will require changes to Classifier.

    NOTE: the default for songs_by_cluster is one dict shared by every call
    that omits the argument (the pylint suppression suggests this is
    intentional). A second call with the same n_clusters therefore raises
    ValueError unless the caller passes its own dict.

    :param data: the data, obtained from read_data(); a mapping of track id
        to a song dict holding a 'features' dict
    :param n_clusters: the number of clusters used in K-means
    :param songs_by_cluster: registry dict, n_clusters:cluster:ids_of_songs;
        updated in place, but only after clustering has succeeded
    :return copy of data (each song deep-copied) with cluster property and
        clusters as boolean features
        dict, n_clusters:cluster:ids_of_songs_in_cluster
    :raises ValueError: if n_clusters is already a key of songs_by_cluster
    '''
    data = dict(data)
    if n_clusters in songs_by_cluster:
        raise ValueError(str(n_clusters) + ' clusters already computed')

    track_ids = list(data)
    # Feature rows are built in track_ids order so that labels_ lines up
    # with track_ids index for index.
    x_train = np.array(
        [list(data[track_id]['features'].values()) for track_id in track_ids])

    km_classifier = KMeans(n_clusters, init='random', max_iter=300,
                           random_state=0, n_init=30)
    km_classifier.fit(x_train)

    # Collect the membership locally and register it only at the end, so a
    # failure above does not leave a half-filled entry that blocks a retry.
    members = {i: set() for i in range(n_clusters)}

    # tolist() yields plain Python ints; pairing labels with ids directly
    # avoids scanning a lookup table once per track.
    for track_id, cluster in zip(track_ids, km_classifier.labels_.tolist()):
        song = deepcopy(data[track_id])
        song['cluster'] = cluster
        # One boolean (0/1) feature per cluster.
        for i in range(n_clusters):
            song['features']['c' + str(i)] = 1 if cluster == i else 0
        data[track_id] = song
        members[cluster].add(track_id)

    songs_by_cluster[n_clusters] = members
    return FrozenMap(data), FrozenMap(songs_by_cluster)
def split_data(data, proportion):
    '''
    Randomly partitions data into a training set and a validation set.

    :param data: complete labeled data, keyed by id
    :param proportion: fraction of the entries to hold out for validation;
        the count is int(len(data) * proportion)
    :return train_data, validation_data
    '''
    all_ids = list(data)
    n_validation = int(len(data) * proportion)
    held_out_ids = rand_sample(all_ids, n_validation)

    validation_data = {id_: data[id_] for id_ in held_out_ids}

    # Everything that was not sampled for validation is used for training.
    train_data = {}
    for id_, entry in data.items():
        if id_ not in validation_data:
            train_data[id_] = entry

    return FrozenMap(train_data), FrozenMap(validation_data)
def test_setitem_TypeError(self):
    '''Item assignment raises TypeError for an existing and for a new key.'''
    expected_message = "'FrozenMap' object does not support item assignment"
    colors = FrozenMap(red='rouge', green='forest', blue='azul')

    # 'blue' is already a key of the map, 'grey' is not.
    for key, replacement in (('blue', 'topaz'), ('grey', 'pewter')):
        with self.assertRaises(TypeError) as raised:
            colors[key] = replacement
        self.assertEqual(str(raised.exception), expected_message)
def get_playlist_features(SPOTIPY_OBJECT, playlist_data, playlist_name):
    '''
    Featurizes a playlist's songs, writes them to a CSV and reloads them.

    The scaled feature table is saved to data/<playlist_name>.csv and then
    read back through read_data(), so the returned data comes from the file.

    :param SPOTIPY_OBJECT: passed through to get_dataframe()
        (presumably a Spotipy client -- confirm against the caller)
    :param playlist_data: passed through to get_dataframe()
    :param playlist_name: used as the CSV file name under data/
    :return result of get_features_and_id() on the reloaded data, and the
        reloaded data itself as a FrozenMap
    '''
    # featurize playlist song data
    raw_frame = get_dataframe(SPOTIPY_OBJECT, playlist_data, -1)
    scaled_frame = scale_data(raw_frame)

    csv_path = 'data/{}.csv'.format(playlist_name)
    scaled_frame.to_csv(csv_path, encoding='utf-8')

    reloaded = read_data(csv_path, False)
    playlist_data_dict = FrozenMap(reloaded)
    features_and_ids = get_features_and_id(playlist_data_dict)
    return features_and_ids, playlist_data_dict
def filter_features(data, discard):
    '''
    Returns a copy of data with the named features removed.

    The passed-in object is left untouched: every entry is deep-copied
    before its 'features' dict is replaced.

    :param data: dict of data, same format as song_data
    :param discard: feature names to discard
    :return copy of data, with filtered features
    '''
    filtered = {}
    for id_, entry in dict(data).items():
        entry_copy = deepcopy(entry)
        kept_features = {}
        for name, value in entry_copy['features'].items():
            if name not in discard:
                kept_features[name] = value
        entry_copy['features'] = kept_features
        filtered[id_] = entry_copy
    return FrozenMap(filtered)
def test_iter(self):
    '''Iterating a FrozenMap yields exactly its keys (order ignored).'''
    colors = FrozenMap(red='rouge', green='forest', blue='azul')
    seen_keys = frozenset(iter(colors))
    expected_keys = frozenset(['red', 'green', 'blue'])
    self.assertEqual(seen_keys, expected_keys)
def test_create(self):
    '''A FrozenMap built from keyword arguments contains every keyword.'''
    rgb = FrozenMap(red='rouge', green='forest', blue='azul')
    self.assertIsInstance(rgb, FrozenMap)
    # assertTrue(x, msg) only checks that x is truthy, so the former
    # assertTrue('green', rgb) could never fail; assertIn performs the
    # membership check that was intended for all three keys.
    for key in ('red', 'green', 'blue'):
        self.assertIn(key, rgb)
def test_hash(self):
    '''hash() of a FrozenMap returns an int.'''
    colors = FrozenMap(red='rouge', green='forest', blue='azul')
    digest = hash(colors)
    self.assertIsInstance(digest, int)
def test_len(self):
    '''len() of a FrozenMap reports the number of keys.'''
    colors = FrozenMap(red='rouge', green='forest', blue='azul')
    size = len(colors)
    self.assertEqual(size, 3)
def test_getitem_KeyError(self):
    '''Looking up a missing key raises KeyError carrying that key.'''
    colors = FrozenMap(red='rouge', green='forest', blue='azul')
    with self.assertRaises(KeyError) as raised:
        colors['grey']
    # str() of a KeyError is the repr of the missing key.
    message = str(raised.exception)
    self.assertEqual(message, "'grey'")
def test_getitem(self):
    '''Subscripting a FrozenMap returns the value stored under the key.'''
    colors = FrozenMap(red='rouge', green='forest', blue='azul')
    looked_up = colors['red']
    self.assertEqual(looked_up, 'rouge')