Example #1
0
def _read_artist_mappings_from_file(file_obj):
    """Parse artist mappings out of an open file.

    Blank lines and lines starting with "#" are ignored.  Every other
    line is split at the first occurrence of _MAPPINGS_SEP; a line that
    does not contain the separator is reported on stderr and skipped.

    Args:
      file_obj: An iterable of text lines, typically an open file

    Returns:
      A (mappings, raw_mappings) 2-tuple where
        * mappings carries the canonicalized left-hand side of each
          line to the canonicalized right-hand side
        * raw_mappings carries the whitespace-stripped left-hand side
          to the whitespace-stripped right-hand side, exactly as
          written in the file, without any canonicalization
    """
    canonical = {}
    verbatim = {}
    for raw_line in file_obj:
        entry = raw_line.strip()
        # Nothing to do for blank lines and comments.
        if entry.startswith("#") or not entry:
            continue
        source, found_sep, target = entry.partition(_MAPPINGS_SEP)
        if found_sep:
            source = source.strip()
            target = target.strip()
            verbatim[source] = target

            canon_source = similarity.canonicalize_string(source)
            canon_target = similarity.canonicalize_string(target)
            canonical[canon_source] = canon_target
        else:
            # No separator on this line: complain and move on.
            sys.stderr.write("Skipping invalid mapping: \"%s\"\n"
                             % entry.encode("utf-8"))

    return canonical, verbatim
Example #2
0
def merge_whitelist_and_mappings(whitelist, raw_mappings):
    """Reconcile a whitelist with a set of raw mappings.

    The mappings are treated as the source of truth: wherever the
    target of a mapping is not reproduced exactly by standardizing it
    against the whitelist, the whitelist is adjusted so that it is.
    Mappings that the whitelist alone can already resolve are dropped.
    The process repeats until neither structure changes any more.

    Args:
      whitelist: A whitelist dict, keyed by canonicalized name
      raw_mappings: A raw mappings dict

    Returns:
      A (whitelist, raw_mappings) pair equivalent to the arguments,
      but with the inconsistencies between the two corrected.
    """
    merged_whitelist = dict(whitelist)
    # Reverse lookup from a whitelist value back to its key.
    key_for_name = {}
    for canon, proper in whitelist.iteritems():
        key_for_name[proper] = canon

    kept_mappings = {}
    for before, after in raw_mappings.iteritems():
        std_before = _standardize(before, whitelist, {})
        std_after = _standardize(after, whitelist, {})
        # The target of a mapping is expected to be, verbatim, an
        # entry of the whitelist.
        if after != std_after:
            # Some whitelist entry standardized "after" to a different
            # spelling; that entry has to go.
            if std_after is not None and std_after in key_for_name:
                merged_whitelist.pop(key_for_name[std_after], None)
            # The source of a mapping must not itself be a whitelist
            # entry, so remove it if present.
            merged_whitelist.pop(similarity.canonicalize_string(before), None)
            # Register the target spelling in the whitelist.
            merged_whitelist[similarity.canonicalize_string(after)] = after
        # A mapping the whitelist can already work out is redundant.
        redundant = std_before and std_before == std_after
        if not redundant:
            kept_mappings[before] = after

    # Keep going until a pass leaves both structures untouched.
    stable = (merged_whitelist == whitelist
              and kept_mappings == raw_mappings)
    if not stable:
        return merge_whitelist_and_mappings(merged_whitelist, kept_mappings)
    return merged_whitelist, kept_mappings
Example #3
0
def _seq_to_whitelist(seq_of_names):
    """Build a whitelist dict from a sequence of artist names.

    Args:
      seq_of_names: An iterable of artist names

    Returns:
      A dict carrying the canonicalized form of each name to the name
      itself, or None if two names share a canonicalized form (the
      collision is reported on stderr).
    """
    whitelist = {}
    for artist in seq_of_names:
        key = similarity.canonicalize_string(artist)
        if key not in whitelist:
            whitelist[key] = artist
            continue
        # Two distinct spellings collapse to the same key: give up.
        sys.stderr.write("Artist whitelist collision: \"%s\" and \"%s\"\n"
                         % (whitelist[key], artist))
        return None
    return whitelist
 def test_basic(self):
     # Each pair is (input, expected canonicalized form).
     expectations = (
         ("", u""),
         ("   ", u""),
         (u"foo", u"foo"),
         ("foo. bar.", u"foobar"),
         ("foo &   Bar  ", u"foo&bar"),
         ("The Foo and Bar", u"foo&bar"),
         ("Foo!!!", u"foo"),
         ("!!!!", u"!!!!"),
     )
     for raw, expected in expectations:
         canonical = similarity.canonicalize_string(raw)
         self.assertEqual(expected, canonical)
Example #5
0
 def test_basic(self):
     # Table of (raw input, expected canonicalized output) pairs.
     table = (
         ("", u""),
         ("   ", u""),
         (u"foo", u"foo"),
         ("foo. bar.", u"foobar"),
         ("foo &   Bar  ", u"foo&bar"),
         ("The Foo and Bar", u"foo&bar"),
         ("Foo!!!", u"foo"),
         ("!!!!", u"!!!!"),
     )
     for given, wanted in table:
         actual = similarity.canonicalize_string(given)
         self.assertEqual(wanted, actual)
Example #6
0
def _standardize_simple(artist_name, whitelist, mappings):
    """Standardize an artist name by direct dictionary lookup only.

    The name is canonicalized and looked up first in the whitelist and
    then in the mappings; no fuzzy matching is attempted.

    Args:
      artist_name: A unicode string containing an artist's name
      whitelist: A whitelist dict that carries canonicalized names
        to names
      mappings: A mappings dict whose keys and values are both
        canonicalized artist names

    Returns:
      A string containing the standardized form of the artist name,
      or None if the name is not recognized.
    """
    key = similarity.canonicalize_string(artist_name)
    # A direct whitelist hit takes precedence over any mapping.
    if key in whitelist:
        return whitelist[key]
    # Otherwise follow the mapping and resolve its target through the
    # whitelist; a target that is missing there yields None.
    if key in mappings:
        return whitelist.get(mappings[key])
    return None
Example #7
0
def suggest(name):
    """Suggest the whitelisted artist name closest to the given name.

    The name is canonicalized and compared, by Levenshtein distance,
    against every key of the global whitelist.  A key is a candidate
    only if it is fewer than MAX_DIST edits away and if that distance,
    divided by the average length of the two strings, is below
    MAX_NORM_DIST.  Among the candidates, the one with the smallest
    normalized distance wins.

    Args:
      name: A string containing an artist's name

    Returns:
      The whitelist entry for the closest candidate, or None if no
      whitelist key is close enough.
    """
    canon_name = similarity.canonicalize_string(name)
    # Take a private snapshot while holding the lock, so that the final
    # lookup sees the same data as the search below.
    _global_lock.acquire()
    try:
        whitelist = dict(_global_whitelist)
    finally:
        _global_lock.release()

    # We ignore any items that are 10 or more edits away from our
    # original name.
    MAX_DIST = 10
    # We also ignore items whose length-normalized distance reaches
    # this bound.
    MAX_NORM_DIST = 0.25

    best_guess = None
    # Starting at the bound means a guess must beat both the bound and
    # every earlier guess in order to be accepted.
    best_dist = MAX_NORM_DIST
    for guess in whitelist:
        # Average length of the two strings being compared.
        normalizer = (len(guess) + len(canon_name)) / 2.0
        # NOTE(review): max_value presumably caps the distance
        # computation -- confirm against
        # similarity.get_levenshtein_distance.
        max_value = min(MAX_DIST, int(1 + normalizer * MAX_NORM_DIST))
        lev_dist = similarity.get_levenshtein_distance(
            canon_name, guess, max_value=max_value)
        if lev_dist >= MAX_DIST:
            continue
        if normalizer:
            normalized_lev_dist = lev_dist / normalizer
        else:
            # Both strings are empty; avoid dividing by zero and treat
            # them as an exact match.
            normalized_lev_dist = 0.0
        if normalized_lev_dist < best_dist:
            best_guess = guess
            best_dist = normalized_lev_dist

    if best_guess is None:
        return None
    return whitelist[best_guess]