import sys

# NOTE: `similarity`, `_MAPPINGS_SEP`, `_standardize`, `_global_lock`, and
# `_global_whitelist` are defined elsewhere in this module/package and are
# assumed to be in scope here.


def _read_artist_mappings_from_file(file_obj):
    """Read the artist mappings from a file.

    Args:
      file_obj: A file object to read the mappings from.

    Returns:
      A 2-tuple containing
        * A mapping dict, which maps canonicalized strings to
          canonicalized strings.
        * A "raw" mapping dict, which gives the mapping exactly as
          described in the file, without any canonicalization.
    """
    raw_mappings = {}
    mappings = {}
    for line in file_obj:
        line = line.strip()
        # Skip blank lines and comments.
        if not line or line.startswith("#"):
            continue
        before, sep, after = line.partition(_MAPPINGS_SEP)
        if not sep:
            sys.stderr.write("Skipping invalid mapping: \"%s\"\n"
                             % line.encode("utf-8"))
            continue
        before = before.strip()
        after = after.strip()
        raw_mappings[before] = after
        canon_before = similarity.canonicalize_string(before)
        canon_after = similarity.canonicalize_string(after)
        mappings[canon_before] = canon_after
    return mappings, raw_mappings
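# A minimal usage sketch (hypothetical, not part of the original module):
# driving the parser above from a UTF-8 mappings file on disk.  The helper
# name and its `path` argument are assumptions for illustration only.
def _example_load_mappings(path):
    import codecs
    # codecs.open yields unicode lines, which matches the unicode handling
    # (line.encode("utf-8")) in the error path above.
    file_obj = codecs.open(path, "r", "utf-8")
    try:
        return _read_artist_mappings_from_file(file_obj)
    finally:
        file_obj.close()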
def merge_whitelist_and_mappings(whitelist, raw_mappings):
    """Combine information from whitelist and mappings.

    Args:
      whitelist: A whitelist dict
      raw_mappings: A raw mappings dict

    Returns:
      A (whitelist, raw_mappings) pair that is equivalent to the args,
      but with certain normalizations applied that take information from
      the mappings and apply it back to the whitelist, thereby correcting
      any inconsistencies.
    """
    new_whitelist = dict(whitelist)
    inv_whitelist = dict((v, k) for k, v in whitelist.iteritems())
    new_raw_mappings = {}
    for before, after in raw_mappings.iteritems():
        std_before = _standardize(before, whitelist, {})
        std_after = _standardize(after, whitelist, {})
        # Every "after" should exactly match a whitelist item.
        if after != std_after:
            if std_after is not None:
                # Delete the whitelist entry that created the non-matching
                # standardization of "after".
                try:
                    del new_whitelist[inv_whitelist[std_after]]
                except KeyError:
                    pass
            # A "before" item in the mappings should never exactly match
            # an existing whitelist entry.  If it does, delete it from
            # the whitelist.
            canon_before = similarity.canonicalize_string(before)
            if canon_before in new_whitelist:
                del new_whitelist[canon_before]
            # Insert the "after" form into the new whitelist.
            canon_after = similarity.canonicalize_string(after)
            new_whitelist[canon_after] = after
        # If we can figure out a mapping based solely on the whitelist,
        # the mapping can be dropped.
        if std_before and std_before == std_after:
            continue
        new_raw_mappings[before] = after
    # If the whitelist and mappings remained stable under these
    # operations, return them.
    if new_whitelist == whitelist and new_raw_mappings == raw_mappings:
        return new_whitelist, new_raw_mappings
    # If something did change, call self recursively on the results.
    return merge_whitelist_and_mappings(new_whitelist, new_raw_mappings)
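# A worked example of the merge semantics above, on hypothetical data and
# assuming _standardize resolves a name only via an exact canonicalized
# whitelist lookup (returning None for unknown names):
#
#   whitelist = {u"beatles": u"The Beatles"}
#   raw_mappings = {u"Beetles": u"The Beatles"}
#   merge_whitelist_and_mappings(whitelist, raw_mappings)
#
# For the one mapping, std_after == after, so the whitelist is untouched;
# std_before is None, so the mapping is kept.  Nothing changed, so the
# recursion bottoms out on the first pass and returns the inputs as-is.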
def _seq_to_whitelist(seq_of_names):
    """Build a whitelist dict from a sequence of artist names.

    Returns None if two names collide under canonicalization.
    """
    new_whitelist = {}
    for name in seq_of_names:
        canon = similarity.canonicalize_string(name)
        if canon in new_whitelist:
            sys.stderr.write("Artist whitelist collision: \"%s\" and \"%s\"\n"
                             % (new_whitelist[canon], name))
            return None
        new_whitelist[canon] = name
    return new_whitelist
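# An end-to-end sketch (hypothetical) of how the helpers above fit
# together: build a whitelist from raw names, parse a mappings file, then
# reconcile the two.  `f` is assumed to be a file object yielding unicode
# lines.
#
#   whitelist = _seq_to_whitelist([u"The Beatles", u"Foo and Bar"])
#   mappings, raw_mappings = _read_artist_mappings_from_file(f)
#   whitelist, raw_mappings = merge_whitelist_and_mappings(
#       whitelist, raw_mappings)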
def test_basic(self):
    test_cases = (
        ("", u""),
        (" ", u""),
        (u"foo", u"foo"),
        ("foo. bar.", u"foobar"),
        ("foo & Bar ", u"foo&bar"),
        ("The Foo and Bar", u"foo&bar"),
        ("Foo!!!", u"foo"),
        ("!!!!", u"!!!!"),
    )
    for before, after in test_cases:
        self.assertEqual(after, similarity.canonicalize_string(before))
def _standardize_simple(artist_name, whitelist, mappings):
    """Attempt to standardize an artist name using only "simple" methods.

    Args:
      artist_name: A unicode string containing an artist's name
      whitelist: A whitelist dict that maps canonicalized names to names
      mappings: A mappings dict whose keys and values are both
        canonicalized artist names

    Returns:
      A string containing the standardized form of the artist name,
      or None if the name is not recognized.
    """
    canon_name = similarity.canonicalize_string(artist_name)
    # We just try to look up the canonicalized form of the artist name
    # in both the whitelist and mappings dicts.
    if canon_name in whitelist:
        return whitelist[canon_name]
    elif canon_name in mappings:
        return whitelist.get(mappings[canon_name])
    else:
        return None
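# A hypothetical, self-checking illustration of the two lookup paths
# above, using canonical forms taken from the test cases in this section
# (canon(u"The Foo and Bar") == u"foo&bar", canon(u"Foo. Bar.") ==
# u"foobar"):
def _example_standardize_simple():
    whitelist = {u"foo&bar": u"Foo and Bar"}
    mappings = {u"foobar": u"foo&bar"}
    # Direct whitelist hit.
    assert _standardize_simple(u"The Foo and Bar", whitelist,
                               mappings) == u"Foo and Bar"
    # Indirect hit via the mappings dict: u"foobar" -> u"foo&bar".
    assert _standardize_simple(u"Foo. Bar.", whitelist,
                               mappings) == u"Foo and Bar"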
def suggest(name):
    canon_name = similarity.canonicalize_string(name)
    _global_lock.acquire()
    try:
        canon_whitelist = list(_global_whitelist)
    finally:
        _global_lock.release()
    best_guess = None
    # We ignore any items that are more than 10 edits away from our
    # original name, or whose edit distance exceeds 25% of the average
    # length of the two strings being compared.
    MAX_DIST = 10
    MAX_NORM_DIST = 0.25
    best_dist = 1e+100
    for guess in canon_whitelist:
        # Normalize by the average of the two string lengths.
        normalizer = (len(guess) + len(canon_name)) / 2.0
        max_value = min(MAX_DIST, int(1 + normalizer * MAX_NORM_DIST))
        lev_dist = similarity.get_levenshtein_distance(
            canon_name, guess, max_value=max_value)
        if lev_dist < MAX_DIST:
            normalized_lev_dist = lev_dist / normalizer
            # Remember the guess with the smallest normalized distance
            # seen so far, provided it is under the threshold.
            if (normalized_lev_dist < MAX_NORM_DIST
                and normalized_lev_dist < best_dist):
                best_guess = guess
                best_dist = normalized_lev_dist
    return _global_whitelist.get(best_guess)
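# Threshold arithmetic, as an illustration: for a canon_name of length 7
# and a guess of length 9,
#
#   normalizer = (9 + 7) / 2.0                == 8.0
#   max_value  = min(10, int(1 + 8.0 * 0.25)) == 3
#
# so the Levenshtein computation can bail out early once the distance
# exceeds 3.  A distance of 2 yields normalized_lev_dist == 0.25, which
# fails the strict < MAX_NORM_DIST test; a distance of 1 yields 0.125 and
# becomes a candidate suggestion.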