# Example #1
def _read_artist_mappings_from_file(file_obj):
    """Read the artist mappings from a file.

    Each line is split on _MAPPINGS_SEP into a "before" and an "after"
    name.  Blank lines and lines starting with "#" are ignored.  Lines
    that do not contain the separator are reported on stderr and skipped.
    If the same "before" name appears more than once, the last occurrence
    wins.

    Args:
      file_obj: A file object (or any iterable of lines) to read the
        mappings from

    Returns:
      A 2-tuple containing
        * A mapping dict, which carries canonicalized strings to
          canonicalized strings
        * A "raw mapping" dict, which gives the mapping exactly as
          described in the file (apart from surrounding whitespace being
          stripped), without any canonicalization.
    """
    raw_mappings = {}
    mappings = {}
    for line in file_obj:
        line = line.strip()
        if not line or line.startswith("#"):
            # Blank lines and comments carry no mapping.
            continue
        before, sep, after = line.partition(_MAPPINGS_SEP)
        if not sep:
            sys.stderr.write("Skipping invalid mapping: \"%s\"\n"
                             % line.encode("utf-8"))
            # Without a separator there is no "after" half, so the line
            # must not be recorded in either dict.
            continue
        before = before.strip()
        after = after.strip()
        raw_mappings[before] = after

        canon_before = similarity.canonicalize_string(before)
        canon_after = similarity.canonicalize_string(after)
        mappings[canon_before] = canon_after

    return mappings, raw_mappings
# Example #2
def merge_whitelist_and_mappings(whitelist, raw_mappings):
    """Combine information from whitelist and mappings.

    The function repeatedly applies its normalizations (by calling itself
    on its own output) until neither the whitelist nor the mappings
    change any further.  The input dicts are not modified.

    Args:
      whitelist: A whitelist dict
      raw_mappings: A raw mappings dict

    Returns:
      A (whitelist, raw_mapping) pair that is equivalent to the args
      but with certain normalizations applied that take information
      from the mappings and applies it back to the whitelist, thereby
      correcting any inconsistencies.
    """
    new_whitelist = dict(whitelist)
    # Reverse lookup: whitelist value -> whitelist key.
    inv_whitelist = dict((v, k) for k, v in whitelist.items())
    new_raw_mappings = {}
    for before, after in raw_mappings.items():
        # Standardize against the whitelist only (empty mappings dict).
        std_before = _standardize(before, whitelist, {})
        std_after = _standardize(after, whitelist, {})
        # Every "after" should exactly match a whitelist item.
        if after != std_after:
            if std_after is not None:
                # Delete the whitelist entry that created the non-matching
                # standardization of after.
                try:
                    del new_whitelist[inv_whitelist[std_after]]
                except KeyError:
                    # Best effort: the entry may already have been removed
                    # while processing an earlier mapping in this pass.
                    pass
            # A "before" item in the mappings should never exactly match
            # an existing whitelist entry.  If it does, delete it from
            # the whitelist.
            canon_before = similarity.canonicalize_string(before)
            if canon_before in new_whitelist:
                del new_whitelist[canon_before]
            # Insert the "after" form into the new whitelist.
            canon_after = similarity.canonicalize_string(after)
            new_whitelist[canon_after] = after
        # If we can figure out a mapping based solely on the whitelist,
        # the mapping can be dropped.
        if std_before and std_before == std_after:
            continue
        new_raw_mappings[before] = after

    # If the whitelist and mappings remained stable under these operations,
    # return them.
    if new_whitelist == whitelist and new_raw_mappings == raw_mappings:
        return new_whitelist, new_raw_mappings
    # If something did change, call self recursively on the results.
    return merge_whitelist_and_mappings(new_whitelist, new_raw_mappings)
# Example #3
def _seq_to_whitelist(seq_of_names):
    """Build a whitelist dict from a sequence of artist names.

    Args:
      seq_of_names: An iterable of artist names

    Returns:
      A dict mapping each name's canonicalized form to the name itself,
      or None if two names share a canonicalized form (the collision is
      reported on stderr).
    """
    by_canon = {}
    for artist in seq_of_names:
        key = similarity.canonicalize_string(artist)
        if key not in by_canon:
            by_canon[key] = artist
            continue
        # Two names collapsing onto the same key aborts the whole build.
        clash = (by_canon[key], artist)
        sys.stderr.write(
            "Artist whitelist collision: \"%s\" and \"%s\"\n" % clash)
        return None
    return by_canon
# Example #6
def _standardize_simple(artist_name, whitelist, mappings):
    """Attempt to standardize an artist name using only "simple" methods.

    Args:
      artist_name: A unicode string containing an artist's name
      whitelist: A whitelist dict that maps canonicalized names
        to names
      mappings: A mappings dict whose keys and values are both
        canonicalized artist names

    Returns:
      A string containing the standardized form of the artist name,
      or None if the name is not recognized.
    """
    canon_name = similarity.canonicalize_string(artist_name)
    # We just try to look up the canonicalized form of the artist name
    # in both the whitelist and mapping dicts.
    if canon_name in whitelist:
        return whitelist[canon_name]
    if canon_name in mappings:
        # The mapping target may itself be absent from the whitelist, in
        # which case .get() yields None.
        return whitelist.get(mappings[canon_name])
    return None
# Example #7
def suggest(name):
    """Suggest the whitelisted artist name closest to the given name.

    Candidates are the keys of _global_whitelist, compared against the
    canonicalized form of name by Levenshtein distance.  Among candidates
    that pass both distance thresholds, the one with the smallest
    normalized distance is chosen; on a tie the first one encountered is
    kept.

    Args:
      name: The artist name to find a suggestion for

    Returns:
      The whitelist value for the best-matching key, or None if no
      candidate is close enough.
    """
    canon_name = similarity.canonicalize_string(name)
    # Snapshot the keys so the loop below works on a stable list.
    # NOTE(review): this line was indented one level deeper than its
    # neighbours, as if it once sat under a guard (e.g. a lock) -- confirm
    # nothing is missing here.
    canon_whitelist = list(_global_whitelist)
    best_guess = None
    # We ignore any items that are 10 or more edits away from our
    # original name, or whose normalized distance is 0.25 or more.
    MAX_DIST = 10
    MAX_NORM_DIST = 0.25
    best_dist = 1e+100
    for guess in canon_whitelist:
        # NOTE(review): as written this is len(guess) + len(canon_name)/2,
        # not the average of the two lengths -- confirm which is intended
        # before changing, since it shifts the acceptance threshold.
        normalizer = (len(guess)+len(canon_name)/2.0)
        # Presumably an upper bound that lets the distance computation
        # stop early -- verify against similarity.get_levenshtein_distance.
        max_value = min(MAX_DIST, int(1+normalizer*MAX_NORM_DIST))
        lev_dist = similarity.get_levenshtein_distance(
            canon_name, guess, max_value=max_value)
        if lev_dist < MAX_DIST:
            # normalizer is zero only when both strings are empty; avoid
            # dividing by zero in that case.
            normalized_lev_dist = lev_dist / normalizer if normalizer else 0.0
            # Only accept a candidate that beats the best one seen so far;
            # otherwise a later, worse match would overwrite a better one.
            if (normalized_lev_dist < MAX_NORM_DIST
                    and normalized_lev_dist < best_dist):
                best_guess = guess
                best_dist = normalized_lev_dist
    return _global_whitelist.get(best_guess)