def test_empty_jobs(self):
    """No checkpoints exist when the job count or the checkpoint count is zero."""
    # Each pair is the (first, second) positional argument of ProgressCheckpoints.
    for constructor_args in ((0, 1), (0, 0), (1, 0)):
        checkpoints = ProgressCheckpoints(*constructor_args)
        self.assertEqual(sorted(checkpoints._checkpoints.keys()), [])
        self.assertEqual(sorted(checkpoints._checkpoints.values()), [])
def test_uniformly_spaced_fractional_distance(self):
    """Checkpoints stay evenly spread when the spacing is not a whole number.

    Also covers requesting more checkpoints than there are jobs, in which
    case every job index is expected to be a checkpoint.
    """
    # (first arg, second arg, expected sorted keys, expected sorted values)
    cases = (
        (
            100, 7,
            [14, 28, 42, 57, 71, 85, 99],
            [14, 28, 42, 57, 71, 85, 100],
        ),
        (
            10, 20,
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [5, 15, 25, 35, 45, 55, 65, 75, 85, 100],
        ),
        (
            5, 10,
            [0, 1, 2, 3, 4],
            [10, 30, 50, 70, 100],
        ),
    )
    for first_arg, second_arg, expected_keys, expected_values in cases:
        checkpoints = ProgressCheckpoints(first_arg, second_arg)
        self.assertEqual(sorted(checkpoints._checkpoints.keys()), expected_keys)
        self.assertEqual(sorted(checkpoints._checkpoints.values()), expected_values)
def cluster(self, threshold, tagger=None):
    """Group the entries of ``self.cluster_dict`` into bins of similar tokens.

    Every pair of tokens is compared with ``similarity()``; pairs scoring at
    least ``threshold`` are queued and then processed from most to least
    similar. The results are written to ``self.cluster_bins`` (cluster id ->
    list of entry indexes) and ``self.index_id_cluster`` (entry index ->
    cluster id); ``self.cluster_count`` is advanced for every new cluster.

    Args:
        threshold: Minimum similarity score for two tokens to be linked.
        tagger: Optional object whose ``window.set_statusbar_message`` is
            called at progress checkpoints. No status is reported when falsy.
    """
    # Min-heap of (distance, [x, y]) with distance = 1 - similarity, so the
    # most similar pairs are popped first.
    heap = []
    num_files = self.cluster_dict.get_size()

    # 20 evenly spaced indexes of files being clustered, used as checkpoints for every 5% progress
    status_update_steps = ProgressCheckpoints(num_files, 20)

    # Lower-case every token once instead of re-fetching and re-lowering
    # token x for each (x, y) pair of the quadratic comparison below.
    # NOTE(review): assumes get_token() is a side-effect-free getter.
    tokens = [
        self.cluster_dict.get_token(index).lower()
        for index in range(num_files)
    ]

    for y in process_events_iter(range(num_files)):
        token_y = tokens[y]
        # Only x < y is visited, so each unordered pair is compared once.
        for x in range(y):
            c = similarity(tokens[x], token_y)
            if c >= threshold:
                heappush(heap, ((1.0 - c), [x, y]))

        # An entry with a non-empty word counted more than once starts out
        # in a cluster of its own.
        word, count = self.cluster_dict.get_word_and_count(y)
        if word and count > 1:
            self.cluster_bins[self.cluster_count] = [y]
            self.index_id_cluster[y] = self.cluster_count
            self.cluster_count += 1

        if tagger and status_update_steps.is_checkpoint(y):
            statusmsg = N_("Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)")
            mparams = {
                'step': self.cluster_type.value,
                'cluster_type': _(self._cluster_type_label()),
                'update': status_update_steps.progress(y),
            }
            tagger.window.set_statusbar_message(statusmsg, mparams)

    while heap:
        _distance, (first, second) = heappop(heap)

        # -1 marks "not in any cluster yet". A plain lookup with a default
        # replaces the former ``except BaseException`` so that interrupts
        # and genuine errors are no longer swallowed.
        match0 = self.index_id_cluster.get(first, -1)
        match1 = self.index_id_cluster.get(second, -1)

        # if neither item is in a cluster, make a new cluster
        if match0 == -1 and match1 == -1:
            self.cluster_bins[self.cluster_count] = [first, second]
            self.index_id_cluster[first] = self.cluster_count
            self.index_id_cluster[second] = self.cluster_count
            self.cluster_count += 1
            continue

        # If cluster0 is in a bin, stick the other match into that bin
        if match0 >= 0 and match1 < 0:
            self.cluster_bins[match0].append(second)
            self.index_id_cluster[second] = match0
            continue

        # If cluster1 is in a bin, stick the other match into that bin
        if match1 >= 0 and match0 < 0:
            self.cluster_bins[match1].append(first)
            self.index_id_cluster[first] = match1
            continue

        # If both matches are already in two different clusters, merge the clusters
        if match1 != match0:
            self.cluster_bins[match0].extend(self.cluster_bins[match1])
            for match in self.cluster_bins[match1]:
                self.index_id_cluster[match] = match0
            del self.cluster_bins[match1]
def cluster(files, threshold, tagger=None):
    """Cluster ``files`` by album and yield one entry per album cluster.

    Artist and album names are collected from each file's metadata (with
    help from its path), clustered independently with ``ClusterEngine``,
    and then combined: files are grouped by album cluster and the artist
    cluster seen most often within a group names its artist.

    Args:
        files: Indexable, sized collection of file objects exposing
            ``metadata`` and ``filename``.
        threshold: Similarity threshold forwarded to ``ClusterEngine.cluster``.
        tagger: Optional object used for status bar progress messages.

    Yields:
        ``(album_name, artist_name, members)`` tuples, where ``members`` is
        a generator over the files belonging to that album cluster.
    """
    settings = get_config().setting
    strip_drive = settings["windows_compatibility"] or IS_WIN
    artist_dict = ClusterDict()
    album_dict = ClusterDict()
    track_ids = []

    # 10 evenly spaced indexes of files being clustered, used as checkpoints for every 10% progress
    checkpoints = ProgressCheckpoints(len(files), 10)

    for position, item in process_events_iter(enumerate(files)):
        artist = item.metadata["albumartist"] or item.metadata["artist"]
        album = item.metadata["album"]
        # Improve clustering from directory structure if no existing tags
        # Only used for grouping and to provide cluster title / artist - not added to file tags.
        path = ntpath.splitdrive(item.filename)[1] if strip_drive else item.filename
        album, artist = album_artist_from_path(path, album, artist)
        # For each track, record the index of the artist and album within the clusters
        artist_index = artist_dict.add(artist)
        album_index = album_dict.add(album)
        track_ids.append((artist_index, album_index))

        if tagger and checkpoints.is_checkpoint(position):
            message = N_("Clustering - step %(step)d/3: %(cluster_type)s (%(update)d%%)")
            message_args = {
                'step': ClusterType.METADATA.value,
                'cluster_type': _(ClusterEngine.cluster_type_label(ClusterType.METADATA)),
                'update': checkpoints.progress(position),
            }
            tagger.window.set_statusbar_message(message, message_args)

    artist_engine = ClusterEngine(artist_dict, ClusterType.ARTIST)
    artist_engine.cluster(threshold, tagger)

    album_engine = ClusterEngine(album_dict, ClusterType.ALBUM)
    album_engine.cluster(threshold, tagger)

    # Arrange tracks into albums: album cluster id -> positions in ``files``
    members_by_album = {}
    for position, (_artist_index, album_index) in enumerate(track_ids):
        album_cluster = album_engine.get_cluster_from_id(album_index)
        if album_cluster is None:
            continue
        members_by_album.setdefault(album_cluster, []).append(position)

    # Now determine the most prominent names in the cluster and build the
    # final cluster list
    for album_cluster, members in members_by_album.items():
        album_name = album_engine.get_cluster_title(album_cluster)

        # The first artist cluster to reach the highest running count wins.
        best_count = 0
        best_artist = None
        votes = {}
        for member in members:
            artist_cluster = artist_engine.get_cluster_from_id(track_ids[member][0])
            if artist_cluster is None:
                continue
            tally = votes.get(artist_cluster, 0) + 1
            if tally > best_count:
                best_count, best_artist = tally, artist_cluster
            votes[artist_cluster] = tally

        if best_artist is not None:
            artist_name = artist_engine.get_cluster_title(best_artist)
        else:
            artist_name = "Various Artists"

        yield album_name, artist_name, (files[member] for member in members)
def test_uniformly_spaced_integer_distance(self):
    """Checkpoints are evenly spaced when the spacing is a whole number."""
    expected_keys = [10, 20, 30, 40, 50, 60, 70, 80, 90, 99]
    expected_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    checkpoints = ProgressCheckpoints(100, 10)

    actual_keys = sorted(checkpoints._checkpoints.keys())
    self.assertEqual(actual_keys, expected_keys)
    actual_values = sorted(checkpoints._checkpoints.values())
    self.assertEqual(actual_values, expected_values)